!pip install numpy==1.23.3
Requirement already satisfied: numpy==1.23.3 in c:\users\herik\anaconda3\lib\site-packages (1.23.3)
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import chart_studio
import chart_studio.plotly as py
import calmap
import datetime
import tensorflow as tf
import os
import random
import re
import plotly.offline as pyoff
import plotly.graph_objs as go
import swifter
from scipy import signal
from datetime import date
from plotly.subplots import make_subplots
from itertools import cycle, product
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from scipy.stats import boxcox
from dateutil.parser import parse
from pmdarima.arima import auto_arima
from pmdarima.utils import diff_inv
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.layers import LSTM, Dense, BatchNormalization
from tensorflow.keras import Sequential
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.initializers import *
from tensorflow.keras import optimizers
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from scipy.special import boxcox1p, inv_boxcox1p
import matplotlib.patches as mpatches
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.model_selection import GridSearchCV
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.forecasting.stl import STLForecast
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.optimizers import Adam
from IPython.display import HTML, display
from swifter import set_defaults
from xgboost import XGBRegressor
from xgboost import plot_importance, plot_tree
C:\Users\herik\anaconda3\lib\site-packages\numpy\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:
C:\Users\herik\anaconda3\lib\site-packages\numpy\.libs\libopenblas.4SP5SUA7CBGXUEOC35YP2ASOICYYEQZZ.gfortran-win_amd64.dll
C:\Users\herik\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
warnings.warn("loaded more than 1 DLL from .libs:"
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Herikc Brecher" --iversions
Author: Herikc Brecher plotly : 5.6.0 keras : 2.10.0 calmap : 0.0.9 tensorflow : 2.10.0 seaborn : 0.11.2 scipy : 1.5.4 numpy : 1.20.0 matplotlib : 3.5.1 re : 2.2.1 pandas : 1.4.2 swifter : 1.3.4 chart_studio: 1.1.0
# Global variables controlling the whole notebook
SEED = 84796315              # fixed seed for reproducibility
FEATURES = 7                 # window size = one week of daily sales
EPOCHS = 1000                # max epochs for the LSTM training
BATCH_SIZE = 1000            # batch size (redefined to 30 before the LSTM section)
EXECUTE_GRID_SEARCH = False  # set True to re-run the (slow) hyper-parameter searches
# Seed every RNG source used below (hash, TensorFlow, NumPy, stdlib random)
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# Use the full browser width for notebook cells
display(HTML('<style>.container { width:100% !important; }</style>'))
pd.options.plotting.backend = 'matplotlib'
# Configure swifter to parallelize DataFrame.apply with dask processes
set_defaults(
npartitions = None,
dask_threshold = 1,
scheduler = "processes",
progress_bar = True,
progress_bar_desc = None,
allow_dask_on_strings = True,
force_parallel = True,
)
# Import the Olist orders dataset
dtOrders = pd.read_csv('../data/olist_orders_dataset.csv', encoding = 'utf8')
# Columns holding timestamps
dateColumns = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',\
'order_delivered_customer_date', 'order_estimated_delivery_date']
# Working copy for the time-series analysis
dtOrdersAdjusted = dtOrders.copy()
# Parse each date column into datetime64
for col in dateColumns:
    dtOrdersAdjusted[col] = pd.to_datetime(dtOrdersAdjusted[col], format = '%Y-%m-%d %H:%M:%S')
# Drop rows with missing values (orders without full delivery info)
dtOrdersAdjusted = dtOrdersAdjusted.dropna()
dtOrdersAdjusted.dtypes
# Daily sales series: count of orders per purchase date
dtHistory = pd.to_datetime(dtOrdersAdjusted['order_purchase_timestamp']).dt.date
start = dtHistory.min()
end = dtHistory.max()
idx = pd.date_range(start, end, normalize = True)
# Reindex over the full calendar so days with no sales appear as 0
seriesOriginal = dtHistory.value_counts(sort = False).sort_index().reindex(idx, fill_value = 0)
dtHistory = pd.DataFrame(seriesOriginal).reset_index()
Principais outliers identificados:
# Rename columns to Data (date) / Vendas (sales) for plotting
dtHistory.rename(columns = {'index': 'Data', 'order_purchase_timestamp': 'Vendas'}, inplace = True)
dtHistory
| Data | Vendas | |
|---|---|---|
| 0 | 2016-09-15 | 1 |
| 1 | 2016-09-16 | 0 |
| 2 | 2016-09-17 | 0 |
| 3 | 2016-09-18 | 0 |
| 4 | 2016-09-19 | 0 |
| ... | ... | ... |
| 709 | 2018-08-25 | 69 |
| 710 | 2018-08-26 | 73 |
| 711 | 2018-08-27 | 66 |
| 712 | 2018-08-28 | 39 |
| 713 | 2018-08-29 | 11 |
714 rows × 2 columns
# Plot of the raw daily sales series (full history, outliers still present)
plot_data = [go.Scatter(x = dtHistory['Data'],
y = dtHistory['Vendas'])]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Periodo'},
yaxis = {'title': 'Vendas'},
title = 'Vendas por dia')
# Render the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.update_layout(yaxis_range = [0, 800])
pyoff.iplot(fig)
# Remove outliers: keep only 2017-01-01 .. 2018-08-17 (drops the sparse 2016
# tail and the truncated final days), and define the 2-month forecast window
seriesOriginal = seriesOriginal[datetime.date(2017, 1, 1): datetime.date(2018, 8, 17)]
pred_range = pd.date_range(datetime.date(2018, 8, 17), datetime.date(2018, 10, 17))
dtHistory = pd.DataFrame(seriesOriginal).reset_index()
dtHistory.rename(columns = {'index': 'Data', 'order_purchase_timestamp': 'Vendas'}, inplace = True)
# Plot the trimmed daily sales series
plot_data = [go.Scatter(x = dtHistory['Data'],
y = dtHistory['Vendas'])]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Periodo'},
yaxis = {'title': 'Vendas'},
title = 'Vendas por Dia')
# Render the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.update_layout(yaxis_range = [0, 800])
pyoff.iplot(fig)
# Mirrored ("expanded") plot: the series and its negative, filled in between,
# to visually emphasize amplitude changes over time
plot_data = [go.Scatter(x = dtHistory['Data'],
y = dtHistory['Vendas'],
showlegend = False),
go.Scatter(x = dtHistory['Data'],
y = -dtHistory['Vendas'],
fill='tonexty',
showlegend = False)]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Periodo'},
yaxis = {'title': 'Vendas'},
title = 'Vendas por Dia Expandida')
# Render the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.update_traces(line_color = 'green')
fig.update_layout(yaxis_range = [-800 , 800])
pyoff.iplot(fig)
# Calendar heatmap of daily sales (one calendar per year) with a shared colorbar
fig, caxs = calmap.calendarplot(seriesOriginal, daylabels = 'MTWTFSS', fillcolor = 'grey',cmap = 'YlGn', fig_kws = dict(figsize = (18, 9)))
fig.suptitle('Histórico de Vendas', fontsize = 22)
fig.subplots_adjust(right = 0.8)
# Dedicated axes on the right for the colorbar
cbar_ax = fig.add_axes([0.85, 0.15, 0.03, 0.67])
fig.colorbar(caxs[0].get_children()[1], cax = cbar_ax)
plt.show()
# Overlay the components of a second STL result onto an existing 4-panel figure
def add_stl_plot(fig, res, legend):
    """Draw trend/seasonal/resid of `res` onto the subplots of `fig`.

    The first axis (observed series) is skipped; residuals are drawn as
    point markers, the other components as lines. `legend` labels both
    the already-plotted result and the one added here.
    """
    axes = fig.get_axes()
    for axis, name in zip(axes[1:], ('trend', 'seasonal', 'resid')):
        component = getattr(res, name)
        if name == 'resid':
            axis.plot(component, marker = 'o', linestyle = 'none')
        else:
            axis.plot(component)
        axis.legend(legend, frameon = False)
# Fit STL (default, non-robust) and plot its four panels
stl = STL(seriesOriginal)
stl_res = stl.fit()
fig = stl_res.plot()
fig.set_size_inches((20, 12))
plt.show()
# Fit a robust STL and overlay the non-robust fit for comparison
stl = STL(seriesOriginal, robust = True)
res_robust = stl.fit()
fig = res_robust.plot()
fig.set_size_inches((20, 12))
res_non_robust = STL(seriesOriginal, robust = False).fit()
add_stl_plot(fig, res_non_robust, ['Robusto', 'Não Robusto'])
# Classical additive decomposition with a weekly period (FEATURES = 7)
additive_decomposition = seasonal_decompose(seriesOriginal, model = 'additive', period = FEATURES)
# Plot the decomposition panels
additive_decomposition.plot()
plt.tight_layout(rect = [0, 0.03, 1, 0.95])
plt.rcParams["figure.figsize"] = (20, 10)
plt.show()
# Remove the linear trend with scipy.signal.detrend
detrended = signal.detrend(seriesOriginal)
# Plot the detrended series
plot_data = [go.Scatter(x = seriesOriginal.index,
y = detrended)]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Período'},
yaxis = {'title': 'Vendas'},
title = 'Vendas Diárias Removendo a Tendência')
# Render the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.update_layout(yaxis_range = [-800 , 800])
pyoff.iplot(fig)
# Fit STL once; `stl`/`res` are reused by the inverse-transform helpers below
stl = STL(seriesOriginal)
res = stl.fit()
# Deseasonalized series: observed minus the STL seasonal component
seriesDeseasonal = res.observed - res.seasonal
# Box-Cox transform (+1 shift so zero-sales days are valid input);
# `lmbda` is reused later by inv_boxcox1p to restore the original scale
seriesBoxCox, lmbda = boxcox(seriesOriginal + 1)
seriesBoxCox = pd.Series(seriesBoxCox, index = seriesOriginal.index)
# Stationary series: 7-day (weekly) seasonal differencing
seriesResidual = seriesOriginal.diff(7).dropna()
Os testes abaixo concluiram:
O teste aceita a hipótese nula em que a série não é estácionária para os dados originais e deseasonal. Já para os dados residuais esses aceitaram a hipótese alternativa que os dados são estacionários.
ADF teste:
# ADF test on the original series (H0: unit root, i.e. non-stationary)
print("Os dados são estacionários?\n")
testResult = adfuller(seriesOriginal, autolag = 'AIC')
print("Valor Teste = {:.3f}".format(testResult[0]))
print("Valor de P: = {:.3f}".format(testResult[1]))
print("\nValores Críticos:")
# Stationary at a level when the test statistic is below that critical value
for p, v in testResult[4].items():
    print("\t{}: {} - O dataset {} é estacionário com {}% de confiança".format(p, v, "não" if v < testResult[0] else "", 100 - int(p[:-1])))
Os dados são estacionários? Valor Teste = -2.616 Valor de P: = 0.090 Valores Críticos: 1%: -3.441694608475642 - O dataset não é estacionário com 99% de confiança 5%: -2.866544718556839 - O dataset não é estacionário com 95% de confiança 10%: -2.5694353738653684 - O dataset é estacionário com 90% de confiança
# ADF test on the deseasonalized series
print("Os dados deseasonal são estacionários?")
testResult = adfuller(seriesDeseasonal, autolag = 'AIC')
print("Valor Teste = {:.3f}".format(testResult[0]))
print("Valor de P: = {:.3f}".format(testResult[1]))
print("\nValores Críticos:")
# Stationary at a level when the test statistic is below that critical value
for p, v in testResult[4].items():
    print("\t{}: {} - O dataset {} é estacionário com {}% de confiança".format(p, v, "não" if v < testResult[0] else "", 100 - int(p[:-1])))
Os dados deseasonal são estacionários? Valor Teste = -2.536 Valor de P: = 0.107 Valores Críticos: 1%: -3.441694608475642 - O dataset não é estacionário com 99% de confiança 5%: -2.866544718556839 - O dataset não é estacionário com 95% de confiança 10%: -2.5694353738653684 - O dataset não é estacionário com 90% de confiança
# ADF test on the weekly-differenced (stationary) series
print("Os dados residuais são estacionários?")
testResult = adfuller(seriesResidual, autolag = 'AIC')
print("Valor Teste = {:.3f}".format(testResult[0]))
print("Valor de P: = {:.3f}".format(testResult[1]))
print("\nValores Críticos:")
# Stationary at a level when the test statistic is below that critical value
for p, v in testResult[4].items():
    print("\t{}: {} - O dataset {} é estacionário com {}% de confiança".format(p, v, "não" if v < testResult[0] else "", 100 - int(p[:-1])))
Os dados residuais são estacionários? Valor Teste = -6.802 Valor de P: = 0.000 Valores Críticos: 1%: -3.441834071558759 - O dataset é estacionário com 99% de confiança 5%: -2.8666061267054626 - O dataset é estacionário com 95% de confiança 10%: -2.569468095872659 - O dataset é estacionário com 90% de confiança
Toda a etapa de modelagem será considerada com 5 passos a frente de previsão.
# Accumulates one metrics row per (algorithm, series-variant) combination
result = pd.DataFrame(columns = ['Algorithm', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Mean_True_Value', 'Mean_Predict_Value'])
# 8-fold expanding time-series CV; both train and test windows are sized
# like the 2-month forecast window (pred_range)
split_range = TimeSeriesSplit(n_splits = 8, max_train_size = pred_range.shape[0], test_size = pred_range.shape[0])
# Append one evaluation row to the results DataFrame
def record(result, algorithm, mse = -1, rmse = -1, mae = -1, mape = -1, mrv = -1, mpv = -1, show = True):
    """Return a copy of `result` with one extra metrics row.

    Missing metrics default to -1; when `show` is True the combined
    DataFrame is displayed in the notebook.
    """
    row = {
        'Algorithm': algorithm,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'Mean_True_Value': mrv,
        'Mean_Predict_Value': mpv,
    }
    combined = pd.concat([result, pd.DataFrame(row, index = [0])], ignore_index = True)
    if show:
        display(combined)
    return combined
# 4-layer tracking plot (one subplot row per series variant, STL-style)
def plot(index, pred, mse, title, fig = None, ax = None, ylabel = ''):
    """Plot the real series against the forecast of the last CV fold.

    index/pred: x values and predictions for the forecast window.
    mse: list of per-fold MSEs; the mean is shown as a text-only legend entry.
    fig/ax: when provided, draw into that subplot and return `fig`;
    when None, create and show a standalone figure (returns None).
    """
    global seriesOriginal
    empty_fig = fig is None
    if empty_fig:
        fig, ax = plt.subplots(figsize = (13, 6))
    else:
        ax.set_ylabel(ylabel)
        ax.set_title(title)
    # White patch with zero handle length = text-only legend showing mean MSE
    patch_ = mpatches.Patch(color = 'white', label = f'MSE: {np.mean(mse):.1e}')
    L1 = ax.legend(handles = [patch_], loc = 'upper left', fancybox = True, framealpha = 0.7, handlelength = 0)
    ax.add_artist(L1)
    sns.lineplot(x = seriesOriginal.index, y = seriesOriginal, label = 'Real', ax = ax)
    sns.lineplot(x = index, y = pred, label = 'Previsto', ax = ax)
    # Red vertical line marks the start of the forecast window
    ax.axvline(x = index[0], color = 'red')
    ax.legend(loc = 'upper right')
    ax.set_ylim([0, 800])
    if empty_fig:
        plt.show()
    else:
        return fig
# Inverse transform for the Original series: nothing to undo
def calcPredTestOriginal(train, pred, test):
    """Identity transform; plot-row index 0 selects the 'Original' subplot."""
    return (pred, test, 0)
# Inverse transform for the deseasonalized series: add the seasonal part back
def calcPredTestseriesDeseasonal(train, pred, test):
    """Restore seasonality on predictions and test; returns (pred, test, 1).

    Uses the module-level `res`/`stl` fitted on seriesOriginal: predictions
    get the last full seasonal cycle of the training window tiled forward;
    test values get their date-aligned seasonal values.
    """
    last_seasonal = res.seasonal.reindex_like(train).tail(stl.period)
    # Tile the final cycle over the forecast horizon
    pred = pred + np.fromiter(cycle(last_seasonal), count = pred.shape[0], dtype = float)
    test = test + res.seasonal.reindex_like(test)
    return pred, test, 1
# Inverse transform for the Box-Cox series
def calcPredTestBoxCox(train, pred, test):
    """Map predictions and test back to the original scale; returns (pred, test, 2).

    Uses the module-level `lmbda` found when the series was transformed
    with boxcox(seriesOriginal + 1).
    """
    restored_pred = inv_boxcox1p(pred, lmbda)
    restored_test = inv_boxcox1p(test, lmbda)
    return restored_pred, restored_test, 2
# Inverse transform for the weekly-differenced (stationary) series
def calcPredTestStationary(train, pred, test):
    """Invert the 7-lag differencing on predictions and test; returns (pred, test, 3).

    `xi` holds the last FEATURES values of the original series within the
    training window. NOTE(review): the level is restored by diff_inv plus
    the cycled `xi` term; this assumes the forecast window immediately
    follows `train` — confirm against split_range before reusing elsewhere.
    """
    xi = seriesOriginal.reindex_like(train).tail(FEATURES)
    totalLen = len(pred) + len(xi)
    ix = pd.date_range(xi.index[0], periods = totalLen)
    inv = diff_inv(pred, FEATURES, xi = xi) + np.fromiter(cycle(xi), count = totalLen, dtype = float)
    inv = pd.Series(inv, index = ix, name = 'Vendas')
    # Drop the first FEATURES entries (the seed values), keep the forecasts
    pred = inv.iloc[FEATURES:]
    totalLen = len(test) + len(xi)
    ix = pd.date_range(xi.index[0], periods = totalLen)
    inv = diff_inv(test, FEATURES, xi = xi) + np.fromiter(cycle(xi), count = totalLen, dtype = float)
    inv = pd.Series(inv, index = ix, name = 'Vendas')
    test = inv.iloc[FEATURES:]
    return pred, test, 3
def calculate_metrics_off(test, pred):
    """Compute MSE, RMSE, MAE, MAPE (%) and the mean of both series."""
    mse = mean_squared_error(test, pred, squared = True)
    # RMSE is the square root of MSE (same value as squared=False)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test, pred)
    mape = 100 * mean_absolute_percentage_error(test, pred)
    return mse, rmse, mae, mape, np.mean(test), np.mean(pred)
def calculate_metrics_on(test, pred, mse, rmse, mae, mape, mtv, mpv):
    """Append the current fold's metrics to the running lists and return them."""
    fold = calculate_metrics_off(test, pred)
    for accumulator, value in zip((mse, rmse, mae, mape, mtv, mpv), fold):
        accumulator.append(value)
    return mse, rmse, mae, mape, mtv, mpv
# Report for Time Series Regression: trains per CV fold, records averaged
# metrics in `result` and draws the tracking plot for the last fold
def reportTSR(data, modelName, calcFunction):
    """Cross-validate a LinearRegression over lagged windows of `data`.

    Uses the module-level split_range/figs/axs; `calcFunction` maps
    predictions back to the original scale and picks the subplot row.
    """
    global result
    global figs
    mse = []
    rmse = []
    mae = []
    mape = []
    mtv = []
    mpv = []
    title = modelName + ' - Time Series Regression'
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        # Sliding windows of FEATURES lags; BATCH_SIZE exceeds the fold size,
        # so gen[0] contains the whole training set
        gen = TimeseriesGenerator(train, train, FEATURES, batch_size = BATCH_SIZE)
        X_train = gen[0][0]
        y_train = gen[0][1]
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        # Seed window = last FEATURES targets; forecast iteratively one step
        # at a time, feeding each prediction back into the window
        X_pred = y_train[-FEATURES:].reshape(1,-1)
        pred = np.empty(test.shape[0])
        for i in range(len(pred)):
            forecast = lr.predict(X_pred)
            X_pred = np.delete(X_pred, 0, 1)
            X_pred = np.concatenate((X_pred, forecast.reshape(-1, 1)), 1)
            pred[i] = forecast
        pred, test, indexPlot = calcFunction(train, pred, test)
        mse, rmse, mae, mape, mtv, mpv = calculate_metrics_on(test, pred, mse, rmse, mae, mape, mtv, mpv)
    result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mape), np.mean(mtv), np.mean(mpv), False)
    return plot(test.index, pred, mse, title, figs, axs[indexPlot], modelName)
# Reset the shared 4-row figure used by the TSR reports
figs, axs = plt.subplots(nrows = 4, sharex = True, figsize = (13,6))
figs.tight_layout()
plt.close()
# One report per series variant; each draws into its own subplot row
reportTSR(seriesOriginal.copy(), 'Original', calcPredTestOriginal)
reportTSR(seriesDeseasonal.copy(), 'Deseasonal', calcPredTestseriesDeseasonal)
reportTSR(seriesBoxCox.copy(), 'BoxCox', calcPredTestBoxCox)
reportTSR(seriesResidual.copy(), 'Stationary', calcPredTestStationary)
result
| Algorithm | MSE | RMSE | MAE | MAPE | Mean_True_Value | Mean_Predict_Value | |
|---|---|---|---|---|---|---|---|
| 0 | Original - Time Series Regression | 5296.329792 | 60.784585 | 42.960631 | 25.553663 | 179.560484 | 168.008596 |
| 1 | Deseasonal - Time Series Regression | 5081.77776 | 58.208746 | 40.122247 | 23.488563 | 179.560484 | 164.670085 |
| 2 | BoxCox - Time Series Regression | 5328.602465 | 61.060147 | 43.254815 | 25.416806 | 179.560484 | 165.399796 |
| 3 | Stationary - Time Series Regression | 6057.828087 | 64.323443 | 46.538896 | 27.899678 | 179.560484 | 184.267746 |
# Hyper-tuning helper for alpha/beta/gamma/damping_trend of Exponential Smoothing
def GSES(data, modelName, alpha, beta, gamma, damping_trend, calcFunction):
    """Cross-validated mean MSE for one smoothing-parameter combination.

    `modelName` is unused but kept for interface compatibility with the
    swifter callers; `calcFunction` maps predictions back to the original
    scale. Folds whose fit fails contribute -1, so callers filter results
    with `mse != -1`.
    """
    mse = []
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        try:
            with catch_warnings():
                filterwarnings('ignore')
                ES = (
                    ExponentialSmoothing(train, trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
                    .fit(smoothing_level = alpha, smoothing_trend = beta, smoothing_seasonal = gamma, method = 'ls', damping_trend = damping_trend)
                )
                pred = ES.forecast(test.shape[0])
                pred, test, _ = calcFunction(train, pred, test)
                mse.append(mean_squared_error(pred, test, squared = True))
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception (failed fits are marked -1)
        except Exception:
            mse.append(-1)
    return np.mean(mse)
# Hyper-tuning helper for the remaining Exponential Smoothing arguments
def GSESOPT(data, modelName, trend, season, periods, bias, method, calcFunction):
    """Cross-validated mean MSE for one (trend, season, periods, bias, method) combo.

    The smoothing parameters themselves are left to the optimizer
    (optimized=True). `modelName` is unused but kept for interface
    compatibility. Folds whose fit fails contribute -1, so callers filter
    results with `mse != -1`.
    """
    mse = []
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        try:
            with catch_warnings():
                filterwarnings('ignore')
                ES = (
                    ExponentialSmoothing(train, trend = trend, seasonal = season, seasonal_periods = periods)
                    .fit(remove_bias = bias, method = method, optimized = True)
                )
                pred = ES.forecast(test.shape[0])
                pred, test, _ = calcFunction(train, pred, test)
                mse.append(mean_squared_error(pred, test, squared = True))
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception (failed fits are marked -1)
        except Exception:
            mse.append(-1)
    return np.mean(mse)
# Report for Exponential Smoothing: fits per CV fold, records averaged
# metrics in `result` and draws the tracking plot for the last fold
def reportES(data, modelName, model_kwargs, fit_kwargs, calcFunction):
    """Cross-validate ExponentialSmoothing with fixed hyper-parameters.

    model_kwargs/fit_kwargs hold the constructor and fit() arguments
    chosen by the grid searches; `calcFunction` maps predictions back to
    the original scale and picks the subplot row.
    """
    global result
    global figs
    mse = []
    rmse = []
    mae = []
    mape = []
    mtv = []
    mpv = []
    title = modelName + ' - Exponential Smoothing'
    indexPlot = 0
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        ES = (
            ExponentialSmoothing(train, trend = model_kwargs['trend'], seasonal = model_kwargs['seasonal'], seasonal_periods = FEATURES, damped_trend = model_kwargs['damped_trend'])
            .fit(smoothing_level = fit_kwargs['smoothing_level'], smoothing_trend = fit_kwargs['smoothing_trend'],\
                 smoothing_seasonal = fit_kwargs['smoothing_seasonal'], method = fit_kwargs['method'], damping_trend = fit_kwargs['damping_trend'])
        )
        pred = ES.forecast(test.shape[0])
        pred, test, indexPlot = calcFunction(train, pred, test)
        mse, rmse, mae, mape, mtv, mpv = calculate_metrics_on(test, pred, mse, rmse, mae, mape, mtv, mpv)
    result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mape), np.mean(mtv), np.mean(mpv), False)
    return plot(test.index, pred, mse, title, figs, axs[indexPlot], modelName)
# Build the wide hyper-tuning grid for Exponential Smoothing
def exp_smoothing_configs(seasonal = (None,)):
    """Return every [trend, seasonal, period, remove_bias, method] combination.

    Parameters
    ----------
    seasonal : iterable
        Candidate seasonal-period values (default: only ``None``).

    Returns
    -------
    list[list]
        One 5-element list per combination, in the same order as the
        original nested loops (trend, seasonal type, period, bias, method).
    """
    t_params = ['add', 'mul']
    s_params = ['add', 'mul']
    r_params = [True, False]
    method_params = ['L-BFGS-B' , 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'bh', 'ls']
    # itertools.product replaces the five nested loops (`product` is already
    # imported at the top of the file). A tuple default also fixes the
    # mutable-default-argument pitfall of the original `seasonal=[None]`.
    return [list(cfg) for cfg in product(t_params, s_params, seasonal, r_params, method_params)]
%%time
# Grid-search alpha/beta/gamma/damping on the original series (slow; gated)
if EXECUTE_GRID_SEARCH:
    # 10 candidate values in [0, 1) for each smoothing parameter
    alphas = betas = gammas = damping_trend = np.arange(1, step = 0.1)
    hyperparam = pd.DataFrame(product(alphas, betas, gammas, damping_trend), columns = ['alpha', 'beta', 'gamma', 'damping_trend'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSES(seriesOriginal.copy(), 'Original',\
        x.alpha, x.beta, x.gamma, x.damping_trend, calcPredTestOriginal), axis = 1)
    # Show the configuration(s) with the lowest valid MSE (-1 marks failed fits)
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
%%time
# If True, grid-search the wider argument list on the original series (slow)
if EXECUTE_GRID_SEARCH:
    # Build the wide argument grid
    params_ = exp_smoothing_configs([FEATURES])
    # Convert the argument list to a DataFrame
    hyperparam_ = pd.DataFrame(params_, columns = ['trend', 'season', 'periods', 'bias', 'method'])
    # Run the hyper-tuning in parallel with swifter
    hyperparam_['mse'] = hyperparam_.swifter.apply(lambda x: GSESOPT(seriesOriginal.copy(), 'Original',\
        x.trend, x.season, x.periods, x.bias, x.method, calcPredTestOriginal),\
        axis = 1)
    # Query for the best valid configuration(s)
    display(hyperparam_.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
# Reset the shared 4-row figure for the Exponential Smoothing reports
figs, axs = plt.subplots(nrows = 4, sharex = True, figsize = (13, 6))
figs.align_ylabels()
figs.tight_layout()
plt.close()
# Hyper-parameters for the original series chosen from the grid searches
model_kwargs = dict(trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
fit_kwargs = dict(smoothing_level = 0.1, smoothing_trend = 0.8, smoothing_seasonal = 0, method = 'ls', damping_trend = 0.8)
reportES(seriesOriginal.copy(), 'Original', model_kwargs, fit_kwargs, calcPredTestOriginal)
%%time
# Grid-search alpha/beta/gamma/damping on the deseasonalized series (gated)
if EXECUTE_GRID_SEARCH:
    alphas = betas = gammas = damping_trend = np.arange(1, step = 0.1)
    hyperparam = pd.DataFrame(product(alphas, betas, gammas, damping_trend), columns = ['alpha', 'beta', 'gamma', 'damping_trend'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSES(seriesDeseasonal.copy(), 'seriesDeseasonal',\
        x.alpha, x.beta, x.gamma, x.damping_trend, calcPredTestseriesDeseasonal), axis = 1)
    # Show the configuration(s) with the lowest valid MSE
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
%%time
# Grid-search the wider argument list on the deseasonalized series (gated)
if EXECUTE_GRID_SEARCH:
    params_ = exp_smoothing_configs([FEATURES])
    hyperparam_ = pd.DataFrame(params_, columns = ['trend', 'season', 'periods', 'bias', 'method'])
    hyperparam_['mse'] = hyperparam_.swifter.apply(lambda x: GSESOPT(seriesDeseasonal.copy(), 'seriesDeseasonal',\
        x.trend, x.season, x.periods, x.bias, x.method, calcPredTestseriesDeseasonal),\
        axis = 1)
    # Show the configuration(s) with the lowest valid MSE
    display(hyperparam_.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
# Hyper-parameters for the deseasonalized series chosen from the grid searches
model_kwargs = dict(trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
fit_kwargs = dict(smoothing_level = 0.1, smoothing_trend = 0.2, smoothing_seasonal = 0.5, method = 'ls', damping_trend = 0.8)
reportES(seriesDeseasonal.copy(), 'Deseasonal', model_kwargs, fit_kwargs, calcPredTestseriesDeseasonal)
%%time
# Grid-search alpha/beta/gamma/damping on the Box-Cox series (gated)
if EXECUTE_GRID_SEARCH:
    alphas = betas = gammas = damping_trend = np.arange(1, step = 0.1)
    hyperparam = pd.DataFrame(product(alphas, betas, gammas, damping_trend), columns = ['alpha', 'beta', 'gamma', 'damping_trend'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSES(seriesBoxCox.copy(), 'BoxCox',\
        x.alpha, x.beta, x.gamma, x.damping_trend, calcPredTestBoxCox), axis = 1)
    # Show the configuration(s) with the lowest valid MSE
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
%%time
# Grid-search the wider argument list on the Box-Cox series (gated)
if EXECUTE_GRID_SEARCH:
    params_ = exp_smoothing_configs([FEATURES])
    hyperparam_ = pd.DataFrame(params_, columns = ['trend', 'season', 'periods', 'bias', 'method'])
    hyperparam_['mse'] = hyperparam_.swifter.apply(lambda x: GSESOPT(seriesBoxCox.copy(), 'BoxCox',\
        x.trend, x.season, x.periods, x.bias, x.method, calcPredTestBoxCox),\
        axis = 1)
    # BUG FIX: this cell previously displayed `hyperparam` (the alpha/beta grid
    # from the earlier GSES cell) instead of `hyperparam_`, so the best BoxCox
    # configuration found here was never shown (compare the Deseasonal and
    # Stationary cells, which correctly query `hyperparam_`).
    display(hyperparam_.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
# Hyper-parameters for the Box-Cox series chosen from the grid searches
model_kwargs = dict(trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
fit_kwargs = dict(smoothing_level = 0.1, smoothing_trend = 0.7, smoothing_seasonal = 0.0, method = 'ls', damping_trend = 0.8)
reportES(seriesBoxCox.copy(), 'BoxCox', model_kwargs, fit_kwargs, calcPredTestBoxCox)
%%time
# Grid-search alpha/beta/gamma/damping on the differenced series (gated)
if EXECUTE_GRID_SEARCH:
    alphas = betas = gammas = damping_trend = np.arange(1, step = 0.1)
    hyperparam = pd.DataFrame(product(alphas, betas, gammas, damping_trend), columns = ['alpha', 'beta', 'gamma', 'damping_trend'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSES(seriesResidual.copy(), 'Stationary',\
        x.alpha, x.beta, x.gamma, x.damping_trend, calcPredTestStationary), axis = 1)
    # Show the configuration(s) with the lowest valid MSE
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
%%time
# Grid-search the wider argument list on the differenced series (gated)
if EXECUTE_GRID_SEARCH:
    params_ = exp_smoothing_configs([FEATURES])
    hyperparam_ = pd.DataFrame(params_, columns = ['trend', 'season', 'periods', 'bias', 'method'])
    hyperparam_['mse'] = hyperparam_.swifter.apply(lambda x: GSESOPT(seriesResidual.copy(), 'Stationary',\
        x.trend, x.season, x.periods, x.bias, x.method, calcPredTestStationary),\
        axis = 1)
    # Show the configuration(s) with the lowest valid MSE
    display(hyperparam_.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
# Hyper-parameters for the differenced series chosen from the grid searches
model_kwargs = dict(trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
fit_kwargs = dict(smoothing_level = 0.0, smoothing_trend = 0.2, smoothing_seasonal = 0.1, method = 'ls', damping_trend = 0.2)
reportES(seriesResidual.copy(), 'Stationary', model_kwargs, fit_kwargs, calcPredTestStationary)
result
| Algorithm | MSE | RMSE | MAE | MAPE | Mean_True_Value | Mean_Predict_Value | |
|---|---|---|---|---|---|---|---|
| 0 | Original - Time Series Regression | 5296.329792 | 60.784585 | 42.960631 | 25.553663 | 179.560484 | 168.008596 |
| 1 | Deseasonal - Time Series Regression | 5081.77776 | 58.208746 | 40.122247 | 23.488563 | 179.560484 | 164.670085 |
| 2 | BoxCox - Time Series Regression | 5328.602465 | 61.060147 | 43.254815 | 25.416806 | 179.560484 | 165.399796 |
| 3 | Stationary - Time Series Regression | 6057.828087 | 64.323443 | 46.538896 | 27.899678 | 179.560484 | 184.267746 |
| 4 | Original - Exponential Smoothing | 4749.16575 | 55.864659 | 38.516078 | 22.095183 | 179.560484 | 165.919136 |
| 5 | Deseasonal - Exponential Smoothing | 4774.382728 | 55.5275 | 37.488666 | 21.8423 | 179.560484 | 166.749778 |
| 6 | BoxCox - Exponential Smoothing | 4653.246578 | 54.946508 | 37.555881 | 21.63599 | 179.560484 | 166.093056 |
| 7 | Stationary - Exponential Smoothing | 5658.627242 | 63.86568 | 43.658278 | 26.080376 | 179.560484 | 179.444886 |
# Report for the ARIMA models: fits per CV fold, records averaged metrics
# in `result` and draws the tracking plot for the last fold
def reportArima(arimaModel, modelName, calcFunction):
    """Cross-validate `arimaModel`, append averaged metrics and plot.

    NOTE(review): this function iterates the module-level `data` variable
    (assigned in the cell right before each call) instead of taking the
    series as a parameter — keep the preceding `data = ...` assignment in
    sync with the model being evaluated.
    """
    global result
    global figs
    mse = []
    rmse = []
    mae = []
    mape = []
    mtv = []
    mpv = []
    title = modelName + ' - Auto Arima'
    indexPlot = 0
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        # Re-fit with the orders found by auto_arima on each training fold
        arimaModel.fit(train)
        pred = arimaModel.predict(test.shape[0])
        pred, test, indexPlot = calcFunction(train, pred, test)
        mse, rmse, mae, mape, mtv, mpv = calculate_metrics_on(test, pred, mse, rmse, mae, mape, mtv, mpv)
    result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mape), np.mean(mtv), np.mean(mpv), False)
    return plot(test.index, pred, mse, title, figs, axs[indexPlot], modelName)
# Reset the shared 4-row figure for the ARIMA reports
figs, axs = plt.subplots(nrows = 4, sharex = True, figsize = (13,6))
figs.align_ylabels()
figs.tight_layout()
plt.close()
# ACF/PACF of the first-differenced series to inspect candidate ARIMA orders
lags = 90
with catch_warnings():
    filterwarnings('ignore')
    fig, ax = plt.subplots(2, figsize = (12, 6), sharex = True)
    plot_acf(seriesOriginal.diff().dropna(), ax = ax[0], lags = lags, missing = 'drop')
    plot_pacf(seriesOriginal.diff().dropna(), ax = ax[1], lags = lags)
    plt.show()
%%time
# auto_arima searches the best (p,d,q)(P,D,Q,m) orders for the original series;
# `data` is the global consumed by reportArima below
data = seriesOriginal.copy()
arimaModel = auto_arima(seriesOriginal.copy(), m = FEATURES, seasonal = True)
arimaModel
CPU times: total: 22.8 s Wall time: 22.5 s
ARIMA(order=(1, 1, 2), scoring_args={}, seasonal_order=(0, 0, 2, 7),
suppress_warnings=True)
reportArima(arimaModel, 'Original', calcPredTestOriginal)
%%time
# Non-seasonal auto_arima on the deseasonalized series (seasonality removed by STL)
data = seriesDeseasonal.copy()
arimaModel = auto_arima(data, m = FEATURES, seasonal = False)
arimaModel
C:\Users\herik\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning: m (7) set for non-seasonal fit. Setting to 0
CPU times: total: 2.53 s Wall time: 2.63 s
ARIMA(order=(2, 1, 1), scoring_args={}, suppress_warnings=True)
reportArima(arimaModel, 'Deseasonal', calcPredTestseriesDeseasonal)
%%time
# Seasonal auto_arima on the Box-Cox transformed series
data = seriesBoxCox.copy()
arimaModel = auto_arima(data, m = FEATURES, seasonal = True)
arimaModel
CPU times: total: 26.8 s Wall time: 26.9 s
ARIMA(order=(1, 1, 1), scoring_args={}, seasonal_order=(1, 0, 1, 7),
suppress_warnings=True)
reportArima(arimaModel, 'BoxCox', calcPredTestBoxCox)
%%time
# Non-seasonal auto_arima on the weekly-differenced (stationary) series
data = seriesResidual.copy()
arimaModel = auto_arima(data, m = FEATURES, seasonal = False)
arimaModel
C:\Users\herik\anaconda3\lib\site-packages\pmdarima\arima\_validation.py:62: UserWarning: m (7) set for non-seasonal fit. Setting to 0
CPU times: total: 5.92 s Wall time: 6 s
ARIMA(order=(3, 0, 3), scoring_args={}, suppress_warnings=True,
with_intercept=False)
reportArima(arimaModel, 'Stationary', calcPredTestStationary)
result
| Algorithm | MSE | RMSE | MAE | MAPE | Mean_True_Value | Mean_Predict_Value | |
|---|---|---|---|---|---|---|---|
| 0 | Original - Time Series Regression | 5296.329792 | 60.784585 | 42.960631 | 25.553663 | 179.560484 | 168.008596 |
| 1 | Deseasonal - Time Series Regression | 5081.77776 | 58.208746 | 40.122247 | 23.488563 | 179.560484 | 164.670085 |
| 2 | BoxCox - Time Series Regression | 5328.602465 | 61.060147 | 43.254815 | 25.416806 | 179.560484 | 165.399796 |
| 3 | Stationary - Time Series Regression | 6057.828087 | 64.323443 | 46.538896 | 27.899678 | 179.560484 | 184.267746 |
| 4 | Original - Exponential Smoothing | 4749.16575 | 55.864659 | 38.516078 | 22.095183 | 179.560484 | 165.919136 |
| 5 | Deseasonal - Exponential Smoothing | 4774.382728 | 55.5275 | 37.488666 | 21.8423 | 179.560484 | 166.749778 |
| 6 | BoxCox - Exponential Smoothing | 4653.246578 | 54.946508 | 37.555881 | 21.63599 | 179.560484 | 166.093056 |
| 7 | Stationary - Exponential Smoothing | 5658.627242 | 63.86568 | 43.658278 | 26.080376 | 179.560484 | 179.444886 |
| 8 | Original - Auto Arima | 11599.889603 | 89.225015 | 70.61781 | 42.311848 | 179.560484 | 194.668305 |
| 9 | Deseasonal - Auto Arima | 9591.302944 | 77.051189 | 59.640244 | 35.253536 | 179.560484 | 191.568473 |
| 10 | BoxCox - Auto Arima | 11393.604917 | 84.409223 | 66.06974 | 39.346984 | 179.560484 | 206.496282 |
| 11 | Stationary - Auto Arima | 5675.808118 | 62.612956 | 43.570659 | 25.453974 | 179.560484 | 169.27759 |
# Redefine the global batch size for the LSTM training section
# (the earlier value of 1000 was used only by TimeseriesGenerator above)
BATCH_SIZE = 30
# Hyper-parameter search for the LSTM algorithm
def GSLSTM(data, learning_rate, calcFunction):
    """Cross-validated MSE of an LSTM forecaster for one learning rate.

    Parameters
    ----------
    data : pd.Series
        Time series to fit (caller passes a copy).
    learning_rate : float
        Adam learning rate being evaluated.
    calcFunction : callable
        Hook ``(train, pred, test) -> (pred, test, plot_index)`` that maps
        the predictions back to the original series scale.

    Returns
    -------
    float
        Mean MSE over the cross-validation folds; folds that failed to
        train contribute -1 instead of an MSE.
    """
    mse = []
    # Cross-validation over each split of the series
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        try:
            with catch_warnings():
                filterwarnings('ignore')
                # Normalise and reshape the training set
                ss = StandardScaler()
                ss.fit(train.values.reshape(-1, 1))
                train_input = ss.transform(train.values.reshape(-1, 1))
                # Build train/test generators from the current fold and batch size
                test_input = train_input[-(FEATURES + 1):]
                test_gen = TimeseriesGenerator(test_input, test_input, length = FEATURES, batch_size = BATCH_SIZE)
                train_gen = TimeseriesGenerator(train_input, train_input, length = FEATURES, batch_size = BATCH_SIZE)
                # Reset the Keras session between folds
                clear_session()
                # GlorotUniform initializer: normalised initial weights
                initializer = GlorotUniform(seed = SEED)
                model = Sequential()
                # 1 LSTM layer (64 units), 2 dense layers with a BatchNormalization in between
                model.add(LSTM(64, input_shape = (FEATURES, 1), return_sequences = False))
                model.add(Dense(1, kernel_initializer = initializer))
                model.add(BatchNormalization())
                model.add(Dense(1, kernel_initializer = initializer))
                # EarlyStopping: abort after 3 consecutive epochs without loss improvement
                early_stopping = EarlyStopping(monitor = 'loss', patience = 3, mode = 'min')
                # Train with the Adam optimizer
                model.compile(loss = 'mse', optimizer = Adam(learning_rate = learning_rate), metrics = ['mae'])
                h = model.fit(train_gen, epochs = EPOCHS, callbacks = [early_stopping], verbose = False)
                pred = np.empty(test.shape[0])
                # Recursive one-step-ahead prediction over the test horizon:
                # feed each prediction back into the input window
                for i in range(len(pred)):
                    prediction = model.predict(test_gen, verbose = False)
                    pred[i] = prediction
                    test_input = np.delete(test_input, 0, 0)
                    test_input = np.concatenate((test_input, np.array(prediction).reshape(-1, 1)), axis = 0)
                    test_gen = TimeseriesGenerator(test_input, test_input, length = FEATURES, batch_size = BATCH_SIZE)
                # Undo the scaling and map back to the original series scale
                pred = ss.inverse_transform(pred.reshape(-1,1)).reshape(-1)
                pred, test, _ = calcFunction(train, pred, test)
                mse.append(mean_squared_error(pred, test))
        except Exception:
            # Training may blow up for extreme learning rates; record -1 for
            # the fold rather than aborting the whole grid search.
            # (Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate and the search remains interruptible.)
            mse.append(-1)
    return np.mean(mse)
# Report for the LSTM algorithm
def reportLSTM(data, modelName, calcFunction, learning_rate):
    """Cross-validate an LSTM forecaster, append its averaged metrics to the
    global `result` table and draw the last fold's predictions on the shared
    figure.

    Parameters
    ----------
    data : pd.Series
        Time series to fit (caller passes a copy).
    modelName : str
        Label prefix for the result row and for the subplot.
    calcFunction : callable
        Hook ``(train, pred, test) -> (pred, test, plot_index)`` that maps
        predictions back to the original series scale.
    learning_rate : float
        Adam learning rate (picked by the grid search above).
    """
    global result
    global figs
    mse = []
    rmse = []
    mae = []
    mape = []
    mtv = []
    mpv = []
    title = modelName + ' - Long Short Term Memory (LSTM)'
    # Cross-validation over each split of the series
    for train_id, test_id in split_range.split(data):
        train, test = data.iloc[train_id], data.iloc[test_id]
        # Normalise and reshape the training set
        ss = StandardScaler()
        ss.fit(train.values.reshape(-1, 1))
        train_input = ss.transform(train.values.reshape(-1, 1))
        # Build train/test generators from the current fold and batch size
        test_input = train_input[-(FEATURES + 1):]
        test_gen = TimeseriesGenerator(test_input, test_input, length = FEATURES, batch_size = BATCH_SIZE)
        train_gen = TimeseriesGenerator(train_input, train_input, length = FEATURES, batch_size = BATCH_SIZE)
        # Reset the Keras session between folds
        clear_session()
        # GlorotUniform initializer: normalised initial weights
        initializer = GlorotUniform(seed = SEED)
        model = Sequential()
        # 1 LSTM layer (64 units), 2 dense layers with a BatchNormalization in between
        model.add(LSTM(64, input_shape = (FEATURES, 1), return_sequences = False))
        model.add(Dense(1, kernel_initializer = initializer))
        model.add(BatchNormalization())
        model.add(Dense(1, kernel_initializer = initializer))
        # EarlyStopping: abort after 3 consecutive epochs without loss improvement
        early_stopping = EarlyStopping(monitor = 'loss', patience = 3, mode = 'min')
        # Train with the Adam optimizer
        model.compile(loss = 'mse', optimizer = Adam(learning_rate = learning_rate), metrics = ['mae'])
        h = model.fit(train_gen, epochs = EPOCHS, callbacks = [early_stopping], verbose = False)
        pred = np.empty(test.shape[0])
        # Recursive one-step-ahead prediction over the test horizon:
        # feed each prediction back into the input window
        for i in range(len(pred)):
            prediction = model.predict(test_gen, verbose = False)
            pred[i] = prediction
            test_input = np.delete(test_input, 0, 0)
            test_input = np.concatenate((test_input, np.array(prediction).reshape(-1, 1)), axis = 0)
            test_gen = TimeseriesGenerator(test_input, test_input, length = FEATURES, batch_size = BATCH_SIZE)
        # Undo the scaling and call the metric-accumulation helper
        pred = ss.inverse_transform(pred.reshape(-1,1)).reshape(-1)
        pred, test, indexPlot = calcFunction(train, pred, test)
        mse, rmse, mae, mape, mtv, mpv = calculate_metrics_on(test, pred, mse, rmse, mae, mape, mtv, mpv)
    result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mape), np.mean(mtv), np.mean(mpv), False)
    return plot(test.index, pred, mse, title, figs, axs[indexPlot], modelName)
%%time
if EXECUTE_GRID_SEARCH:
    # Build the hyper-tuning table with candidate learning rates (1e-5 .. 10)
    learning_rates = np.logspace(-5, 1, 7)
    hyperparam = pd.DataFrame(learning_rates, columns = ['learning_rate'])
    # swifter parallelises the per-row grid-search evaluation
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSLSTM(seriesOriginal.copy(), x.learning_rate, calcPredTestOriginal), axis = 1)
    # Show the row with the best (minimum, valid) MSE
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
# Reset the figure: 4 stacked subplots sharing the x axis
figs, axs = plt.subplots(nrows = 4, sharex = True, figsize = (13,6))
figs.align_ylabels()
figs.tight_layout()
plt.close()
# LSTM report on the original series with the tuned learning rate
reportLSTM(seriesOriginal.copy(), 'Original', calcPredTestOriginal, 0.0001)
%%time
# Grid search over learning rates for the deseasonalised series
if EXECUTE_GRID_SEARCH:
    learning_rates = np.logspace(-5, 1, 7)
    hyperparam = pd.DataFrame(learning_rates, columns = ['learning_rate'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSLSTM(seriesDeseasonal.copy(), x.learning_rate, calcPredTestseriesDeseasonal), axis = 1)
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 0 ns
# LSTM report on the deseasonalised series with the tuned learning rate
reportLSTM(seriesDeseasonal.copy(), 'Deseasonal', calcPredTestseriesDeseasonal, 0.01)
%%time
# Grid search over learning rates for the Box-Cox-transformed series
if EXECUTE_GRID_SEARCH:
    learning_rates = np.logspace(-5, 1, 7)
    hyperparam = pd.DataFrame(learning_rates, columns = ['learning_rate'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSLSTM(seriesBoxCox.copy(), x.learning_rate, calcPredTestBoxCox), axis = 1)
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 999 µs
# LSTM report on the Box-Cox series with the tuned learning rate
reportLSTM(seriesBoxCox.copy(), 'BoxCox', calcPredTestBoxCox, 0.001)
%%time
# Grid search over learning rates for the stationary residual series
if EXECUTE_GRID_SEARCH:
    learning_rates = np.logspace(-5, 1, 7)
    hyperparam = pd.DataFrame(learning_rates, columns = ['learning_rate'])
    hyperparam['mse'] = hyperparam.swifter.apply(lambda x: GSLSTM(seriesResidual.copy(), x.learning_rate, calcPredTestStationary), axis = 1)
    display(hyperparam.query('mse == mse.min() and mse != -1'))
CPU times: total: 0 ns Wall time: 1 ms
# LSTM report on the stationary series with the tuned learning rate
reportLSTM(seriesResidual.copy(), 'Stationary', calcPredTestStationary, 0.00001)
result
| Algorithm | MSE | RMSE | MAE | MAPE | Mean_True_Value | Mean_Predict_Value | |
|---|---|---|---|---|---|---|---|
| 0 | Original - Time Series Regression | 5296.329792 | 60.784585 | 42.960631 | 25.553663 | 179.560484 | 168.008596 |
| 1 | Deseasonal - Time Series Regression | 5081.77776 | 58.208746 | 40.122247 | 23.488563 | 179.560484 | 164.670085 |
| 2 | BoxCox - Time Series Regression | 5328.602465 | 61.060147 | 43.254815 | 25.416806 | 179.560484 | 165.399796 |
| 3 | Stationary - Time Series Regression | 6057.828087 | 64.323443 | 46.538896 | 27.899678 | 179.560484 | 184.267746 |
| 4 | Original - Exponential Smoothing | 4749.16575 | 55.864659 | 38.516078 | 22.095183 | 179.560484 | 165.919136 |
| 5 | Deseasonal - Exponential Smoothing | 4774.382728 | 55.5275 | 37.488666 | 21.8423 | 179.560484 | 166.749778 |
| 6 | BoxCox - Exponential Smoothing | 4653.246578 | 54.946508 | 37.555881 | 21.63599 | 179.560484 | 166.093056 |
| 7 | Stationary - Exponential Smoothing | 5658.627242 | 63.86568 | 43.658278 | 26.080376 | 179.560484 | 179.444886 |
| 8 | Original - Auto Arima | 11599.889603 | 89.225015 | 70.61781 | 42.311848 | 179.560484 | 194.668305 |
| 9 | Deseasonal - Auto Arima | 9591.302944 | 77.051189 | 59.640244 | 35.253536 | 179.560484 | 191.568473 |
| 10 | BoxCox - Auto Arima | 11393.604917 | 84.409223 | 66.06974 | 39.346984 | 179.560484 | 206.496282 |
| 11 | Stationary - Auto Arima | 5675.808118 | 62.612956 | 43.570659 | 25.453974 | 179.560484 | 169.27759 |
| 12 | Original - Long Short Term Memory (LSTM) | 9381.265495 | 84.147816 | 61.880612 | 38.23957 | 179.560484 | 188.430568 |
| 13 | Deseasonal - Long Short Term Memory (LSTM) | 5064.39062 | 57.410273 | 39.893818 | 23.371864 | 179.560484 | 164.653074 |
| 14 | BoxCox - Long Short Term Memory (LSTM) | 5325.842935 | 63.086576 | 44.862384 | 25.808698 | 179.560484 | 159.906651 |
| 15 | Stationary - Long Short Term Memory (LSTM) | 8713.79911 | 76.748656 | 57.541623 | 34.481319 | 179.560484 | 202.934807 |
def create_features(df, label = None):
    """Derive calendar features from a DataFrame's DatetimeIndex.

    Adds the date parts as new columns on *df* (mutates it in place) and
    returns the feature matrix ``X``, plus the target column ``y`` when
    *label* is given.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a DatetimeIndex (it is copied into a 'date' column).
    label : str, optional
        Name of the target column to return as ``y``.

    Returns
    -------
    X, or (X, y) when *label* is provided.
    """
    # Extract the calendar parts of each timestamp
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    # Series.dt.weekofyear was deprecated (and removed in pandas 2.0);
    # isocalendar().week is the supported equivalent (ISO week number).
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    if label:
        y = df[label]
        return X, y
    return X
# Convert dtHistory to an indexed series: 'Data' becomes the index, keep 'Vendas'
dtHistoryIndexado = dtHistory.copy()
dtHistoryIndexado.index = dtHistoryIndexado['Data']
dtHistoryIndexado = dtHistoryIndexado['Vendas']
# Split train/test at a fixed date (train = up to and including split_date)
split_date = '2018-06-16'
train = dtHistoryIndexado.loc[dtHistoryIndexado.index <= split_date].copy()
test = dtHistoryIndexado.loc[dtHistoryIndexado.index > split_date].copy()
# Convert the train and test series back to DataFrames
train = pd.DataFrame(train)
test = pd.DataFrame(test)
# Build calendar features X and target y for both sets
X_train, y_train = create_features(train, label = 'Vendas')
X_test, y_test = create_features(test, label = 'Vendas')
# Train the model
# NOTE(review): XGBRegressor is not imported in the visible file header —
# presumably imported in an earlier cell (from xgboost import XGBRegressor); verify.
# NOTE(review): eval_set is the TRAINING data, so early stopping monitors
# training RMSE and will essentially never trigger — confirm this is intended.
model = XGBRegressor(n_estimators = 1000, early_stopping_rounds = 10, objective = 'reg:squarederror')
model.fit(X_train, y_train,
eval_set = [(X_train, y_train)],
verbose = True)
[0] validation_0-rmse:128.20663 [1] validation_0-rmse:94.86369 [2] validation_0-rmse:71.45047 [3] validation_0-rmse:54.90688 [4] validation_0-rmse:43.03469 [5] validation_0-rmse:34.67400 [6] validation_0-rmse:28.38199 [7] validation_0-rmse:23.87289 [8] validation_0-rmse:20.62290 [9] validation_0-rmse:18.05469 [10] validation_0-rmse:16.07627 [11] validation_0-rmse:14.50070 [12] validation_0-rmse:13.32707 [13] validation_0-rmse:12.46696 [14] validation_0-rmse:11.73353 [15] validation_0-rmse:11.21462 [16] validation_0-rmse:10.89039 [17] validation_0-rmse:10.55919 [18] validation_0-rmse:10.25021 [19] validation_0-rmse:10.05513 [20] validation_0-rmse:9.81996 [21] validation_0-rmse:9.37028 [22] validation_0-rmse:9.13984 [23] validation_0-rmse:9.05815 [24] validation_0-rmse:8.90733 [25] validation_0-rmse:8.58755 [26] validation_0-rmse:8.32837 [27] validation_0-rmse:8.09972 [28] validation_0-rmse:7.90257 [29] validation_0-rmse:7.56035 [30] validation_0-rmse:7.28918 [31] validation_0-rmse:7.11743 [32] validation_0-rmse:7.05787 [33] validation_0-rmse:6.81881 [34] validation_0-rmse:6.64004 [35] validation_0-rmse:6.55754 [36] validation_0-rmse:6.44605 [37] validation_0-rmse:6.39392 [38] validation_0-rmse:6.30512 [39] validation_0-rmse:6.12802 [40] validation_0-rmse:6.04128 [41] validation_0-rmse:5.89725 [42] validation_0-rmse:5.75833 [43] validation_0-rmse:5.65882 [44] validation_0-rmse:5.45320 [45] validation_0-rmse:5.43852 [46] validation_0-rmse:5.30536 [47] validation_0-rmse:5.23324 [48] validation_0-rmse:5.16132 [49] validation_0-rmse:4.98583 [50] validation_0-rmse:4.90491 [51] validation_0-rmse:4.85728 [52] validation_0-rmse:4.75022 [53] validation_0-rmse:4.62811 [54] validation_0-rmse:4.51488 [55] validation_0-rmse:4.43978 [56] validation_0-rmse:4.39684 [57] validation_0-rmse:4.31282 [58] validation_0-rmse:4.26165 [59] validation_0-rmse:4.15069 [60] validation_0-rmse:4.11449 [61] validation_0-rmse:3.90410 [62] validation_0-rmse:3.87873 [63] validation_0-rmse:3.73529 [64] 
validation_0-rmse:3.62709 [65] validation_0-rmse:3.54595 [66] validation_0-rmse:3.49228 [67] validation_0-rmse:3.39021 [68] validation_0-rmse:3.33934 [69] validation_0-rmse:3.26861 [70] validation_0-rmse:3.23306 [71] validation_0-rmse:3.20760 [72] validation_0-rmse:3.14395 [73] validation_0-rmse:3.09425 [74] validation_0-rmse:3.06099 [75] validation_0-rmse:2.98244 [76] validation_0-rmse:2.95662 [77] validation_0-rmse:2.92287 [78] validation_0-rmse:2.88777 [79] validation_0-rmse:2.85063 [80] validation_0-rmse:2.83178 [81] validation_0-rmse:2.74686 [82] validation_0-rmse:2.70909 [83] validation_0-rmse:2.69516 [84] validation_0-rmse:2.64085 [85] validation_0-rmse:2.60230 [86] validation_0-rmse:2.54686 [87] validation_0-rmse:2.51569 [88] validation_0-rmse:2.46884 [89] validation_0-rmse:2.46565 [90] validation_0-rmse:2.44668 [91] validation_0-rmse:2.43844 [92] validation_0-rmse:2.43554 [93] validation_0-rmse:2.39080 [94] validation_0-rmse:2.32231 [95] validation_0-rmse:2.29849 [96] validation_0-rmse:2.24401 [97] validation_0-rmse:2.23769 [98] validation_0-rmse:2.22102 [99] validation_0-rmse:2.14938 [100] validation_0-rmse:2.11655 [101] validation_0-rmse:2.04486 [102] validation_0-rmse:2.00916 [103] validation_0-rmse:1.99977 [104] validation_0-rmse:1.97283 [105] validation_0-rmse:1.96337 [106] validation_0-rmse:1.93909 [107] validation_0-rmse:1.88404 [108] validation_0-rmse:1.85057 [109] validation_0-rmse:1.82730 [110] validation_0-rmse:1.77147 [111] validation_0-rmse:1.72463 [112] validation_0-rmse:1.67492 [113] validation_0-rmse:1.64281 [114] validation_0-rmse:1.59936 [115] validation_0-rmse:1.58977 [116] validation_0-rmse:1.57191 [117] validation_0-rmse:1.52187 [118] validation_0-rmse:1.49802 [119] validation_0-rmse:1.45115 [120] validation_0-rmse:1.41551 [121] validation_0-rmse:1.38633 [122] validation_0-rmse:1.34686 [123] validation_0-rmse:1.33484 [124] validation_0-rmse:1.31959 [125] validation_0-rmse:1.29124 [126] validation_0-rmse:1.26931 [127] 
validation_0-rmse:1.24123 [128] validation_0-rmse:1.23362 [129] validation_0-rmse:1.22228 [130] validation_0-rmse:1.18447 [131] validation_0-rmse:1.15726 [132] validation_0-rmse:1.14962 [133] validation_0-rmse:1.14116 [134] validation_0-rmse:1.11896 [135] validation_0-rmse:1.11442 [136] validation_0-rmse:1.10583 [137] validation_0-rmse:1.09802 [138] validation_0-rmse:1.09044 [139] validation_0-rmse:1.07587 [140] validation_0-rmse:1.05242 [141] validation_0-rmse:1.02155 [142] validation_0-rmse:0.99953 [143] validation_0-rmse:0.99340 [144] validation_0-rmse:0.98261 [145] validation_0-rmse:0.96142 [146] validation_0-rmse:0.94268 [147] validation_0-rmse:0.92258 [148] validation_0-rmse:0.90560 [149] validation_0-rmse:0.89752 [150] validation_0-rmse:0.88387 [151] validation_0-rmse:0.86778 [152] validation_0-rmse:0.85894 [153] validation_0-rmse:0.84767 [154] validation_0-rmse:0.83853 [155] validation_0-rmse:0.81243 [156] validation_0-rmse:0.80764 [157] validation_0-rmse:0.79768 [158] validation_0-rmse:0.78877 [159] validation_0-rmse:0.76749 [160] validation_0-rmse:0.76348 [161] validation_0-rmse:0.76215 [162] validation_0-rmse:0.75333 [163] validation_0-rmse:0.74584 [164] validation_0-rmse:0.73255 [165] validation_0-rmse:0.73160 [166] validation_0-rmse:0.72770 [167] validation_0-rmse:0.71881 [168] validation_0-rmse:0.71656 [169] validation_0-rmse:0.71298 [170] validation_0-rmse:0.68512 [171] validation_0-rmse:0.67267 [172] validation_0-rmse:0.65677 [173] validation_0-rmse:0.64756 [174] validation_0-rmse:0.63954 [175] validation_0-rmse:0.63470 [176] validation_0-rmse:0.62832 [177] validation_0-rmse:0.61528 [178] validation_0-rmse:0.60661 [179] validation_0-rmse:0.59926 [180] validation_0-rmse:0.59806 [181] validation_0-rmse:0.58996 [182] validation_0-rmse:0.57565 [183] validation_0-rmse:0.56138 [184] validation_0-rmse:0.55836 [185] validation_0-rmse:0.55666 [186] validation_0-rmse:0.54100 [187] validation_0-rmse:0.53053 [188] validation_0-rmse:0.50536 [189] 
validation_0-rmse:0.50004 [190] validation_0-rmse:0.48929 [191] validation_0-rmse:0.48566 [192] validation_0-rmse:0.47499 [193] validation_0-rmse:0.46781 [194] validation_0-rmse:0.46620 [195] validation_0-rmse:0.45854 [196] validation_0-rmse:0.44847 [197] validation_0-rmse:0.44670 [198] validation_0-rmse:0.44009 [199] validation_0-rmse:0.43690 [200] validation_0-rmse:0.43205 [201] validation_0-rmse:0.42523 [202] validation_0-rmse:0.42301 [203] validation_0-rmse:0.42193 [204] validation_0-rmse:0.41781 [205] validation_0-rmse:0.41218 [206] validation_0-rmse:0.41109 [207] validation_0-rmse:0.40660 [208] validation_0-rmse:0.39859 [209] validation_0-rmse:0.39720 [210] validation_0-rmse:0.39523 [211] validation_0-rmse:0.39371 [212] validation_0-rmse:0.38426 [213] validation_0-rmse:0.38314 [214] validation_0-rmse:0.38033 [215] validation_0-rmse:0.37524 [216] validation_0-rmse:0.36696 [217] validation_0-rmse:0.35834 [218] validation_0-rmse:0.35602 [219] validation_0-rmse:0.35071 [220] validation_0-rmse:0.34955 [221] validation_0-rmse:0.34639 [222] validation_0-rmse:0.33860 [223] validation_0-rmse:0.33754 [224] validation_0-rmse:0.33126 [225] validation_0-rmse:0.32824 [226] validation_0-rmse:0.31377 [227] validation_0-rmse:0.31023 [228] validation_0-rmse:0.30859 [229] validation_0-rmse:0.30134 [230] validation_0-rmse:0.29540 [231] validation_0-rmse:0.29346 [232] validation_0-rmse:0.28856 [233] validation_0-rmse:0.28304 [234] validation_0-rmse:0.28267 [235] validation_0-rmse:0.28002 [236] validation_0-rmse:0.27556 [237] validation_0-rmse:0.27532 [238] validation_0-rmse:0.27343 [239] validation_0-rmse:0.26832 [240] validation_0-rmse:0.26565 [241] validation_0-rmse:0.26521 [242] validation_0-rmse:0.26318 [243] validation_0-rmse:0.26253 [244] validation_0-rmse:0.26147 [245] validation_0-rmse:0.25817 [246] validation_0-rmse:0.25488 [247] validation_0-rmse:0.24893 [248] validation_0-rmse:0.24595 [249] validation_0-rmse:0.23796 [250] validation_0-rmse:0.23426 [251] 
validation_0-rmse:0.23018 [252] validation_0-rmse:0.22547 [253] validation_0-rmse:0.22338 [254] validation_0-rmse:0.21877 [255] validation_0-rmse:0.21275 [256] validation_0-rmse:0.20991 [257] validation_0-rmse:0.20877 [258] validation_0-rmse:0.20597 [259] validation_0-rmse:0.20474 [260] validation_0-rmse:0.20149 [261] validation_0-rmse:0.20013 [262] validation_0-rmse:0.19902 [263] validation_0-rmse:0.19648 [264] validation_0-rmse:0.19528 [265] validation_0-rmse:0.19469 [266] validation_0-rmse:0.18881 [267] validation_0-rmse:0.18798 [268] validation_0-rmse:0.18735 [269] validation_0-rmse:0.18524 [270] validation_0-rmse:0.18483 [271] validation_0-rmse:0.18418 [272] validation_0-rmse:0.17883 [273] validation_0-rmse:0.17609 [274] validation_0-rmse:0.17414 [275] validation_0-rmse:0.16952 [276] validation_0-rmse:0.16754 [277] validation_0-rmse:0.16581 [278] validation_0-rmse:0.16290 [279] validation_0-rmse:0.16119 [280] validation_0-rmse:0.15860 [281] validation_0-rmse:0.15652 [282] validation_0-rmse:0.15257 [283] validation_0-rmse:0.15240 [284] validation_0-rmse:0.15080 [285] validation_0-rmse:0.14820 [286] validation_0-rmse:0.14598 [287] validation_0-rmse:0.14231 [288] validation_0-rmse:0.14112 [289] validation_0-rmse:0.14066 [290] validation_0-rmse:0.14046 [291] validation_0-rmse:0.14019 [292] validation_0-rmse:0.13967 [293] validation_0-rmse:0.13944 [294] validation_0-rmse:0.13932 [295] validation_0-rmse:0.13910 [296] validation_0-rmse:0.13768 [297] validation_0-rmse:0.13250 [298] validation_0-rmse:0.13058 [299] validation_0-rmse:0.12879 [300] validation_0-rmse:0.12816 [301] validation_0-rmse:0.12526 [302] validation_0-rmse:0.12470 [303] validation_0-rmse:0.12304 [304] validation_0-rmse:0.12117 [305] validation_0-rmse:0.12036 [306] validation_0-rmse:0.11949 [307] validation_0-rmse:0.11591 [308] validation_0-rmse:0.11018 [309] validation_0-rmse:0.10918 [310] validation_0-rmse:0.10708 [311] validation_0-rmse:0.10631 [312] validation_0-rmse:0.10267 [313] 
validation_0-rmse:0.09755 [314] validation_0-rmse:0.09701 [315] validation_0-rmse:0.09539 [316] validation_0-rmse:0.09414 [317] validation_0-rmse:0.09252 [318] validation_0-rmse:0.09237 [319] validation_0-rmse:0.09188 [320] validation_0-rmse:0.08930 [321] validation_0-rmse:0.08898 [322] validation_0-rmse:0.08630 [323] validation_0-rmse:0.08493 [324] validation_0-rmse:0.08361 [325] validation_0-rmse:0.08228 [326] validation_0-rmse:0.08167 [327] validation_0-rmse:0.08051 [328] validation_0-rmse:0.08005 [329] validation_0-rmse:0.07693 [330] validation_0-rmse:0.07527 [331] validation_0-rmse:0.07502 [332] validation_0-rmse:0.07492 [333] validation_0-rmse:0.07485 [334] validation_0-rmse:0.07478 [335] validation_0-rmse:0.07251 [336] validation_0-rmse:0.07063 [337] validation_0-rmse:0.07035 [338] validation_0-rmse:0.06866 [339] validation_0-rmse:0.06803 [340] validation_0-rmse:0.06754 [341] validation_0-rmse:0.06549 [342] validation_0-rmse:0.06495 [343] validation_0-rmse:0.06397 [344] validation_0-rmse:0.06346 [345] validation_0-rmse:0.06309 [346] validation_0-rmse:0.06181 [347] validation_0-rmse:0.05966 [348] validation_0-rmse:0.05961 [349] validation_0-rmse:0.05913 [350] validation_0-rmse:0.05908 [351] validation_0-rmse:0.05724 [352] validation_0-rmse:0.05584 [353] validation_0-rmse:0.05480 [354] validation_0-rmse:0.05341 [355] validation_0-rmse:0.05152 [356] validation_0-rmse:0.05142 [357] validation_0-rmse:0.04994 [358] validation_0-rmse:0.04913 [359] validation_0-rmse:0.04893 [360] validation_0-rmse:0.04669 [361] validation_0-rmse:0.04623 [362] validation_0-rmse:0.04555 [363] validation_0-rmse:0.04537 [364] validation_0-rmse:0.04502 [365] validation_0-rmse:0.04491 [366] validation_0-rmse:0.04369 [367] validation_0-rmse:0.04288 [368] validation_0-rmse:0.04263 [369] validation_0-rmse:0.04254 [370] validation_0-rmse:0.04199 [371] validation_0-rmse:0.04192 [372] validation_0-rmse:0.04156 [373] validation_0-rmse:0.04115 [374] validation_0-rmse:0.04036 [375] 
validation_0-rmse:0.03985 [376] validation_0-rmse:0.03915 [377] validation_0-rmse:0.03867 [378] validation_0-rmse:0.03737 [379] validation_0-rmse:0.03700 [380] validation_0-rmse:0.03615 [381] validation_0-rmse:0.03567 [382] validation_0-rmse:0.03546 [383] validation_0-rmse:0.03517 [384] validation_0-rmse:0.03497 [385] validation_0-rmse:0.03488 [386] validation_0-rmse:0.03449 [387] validation_0-rmse:0.03444 [388] validation_0-rmse:0.03401 [389] validation_0-rmse:0.03347 [390] validation_0-rmse:0.03331 [391] validation_0-rmse:0.03306 [392] validation_0-rmse:0.03244 [393] validation_0-rmse:0.03195 [394] validation_0-rmse:0.03166 [395] validation_0-rmse:0.03155 [396] validation_0-rmse:0.03108 [397] validation_0-rmse:0.03083 [398] validation_0-rmse:0.03009 [399] validation_0-rmse:0.02956 [400] validation_0-rmse:0.02938 [401] validation_0-rmse:0.02897 [402] validation_0-rmse:0.02880 [403] validation_0-rmse:0.02861 [404] validation_0-rmse:0.02858 [405] validation_0-rmse:0.02849 [406] validation_0-rmse:0.02845 [407] validation_0-rmse:0.02781 [408] validation_0-rmse:0.02669 [409] validation_0-rmse:0.02659 [410] validation_0-rmse:0.02652 [411] validation_0-rmse:0.02592 [412] validation_0-rmse:0.02581 [413] validation_0-rmse:0.02554 [414] validation_0-rmse:0.02528 [415] validation_0-rmse:0.02488 [416] validation_0-rmse:0.02454 [417] validation_0-rmse:0.02443 [418] validation_0-rmse:0.02424 [419] validation_0-rmse:0.02407 [420] validation_0-rmse:0.02379 [421] validation_0-rmse:0.02356 [422] validation_0-rmse:0.02296 [423] validation_0-rmse:0.02281 [424] validation_0-rmse:0.02253 [425] validation_0-rmse:0.02148 [426] validation_0-rmse:0.02097 [427] validation_0-rmse:0.02095 [428] validation_0-rmse:0.02069 [429] validation_0-rmse:0.02037 [430] validation_0-rmse:0.02034 [431] validation_0-rmse:0.01970 [432] validation_0-rmse:0.01956 [433] validation_0-rmse:0.01954 [434] validation_0-rmse:0.01886 [435] validation_0-rmse:0.01867 [436] validation_0-rmse:0.01863 [437] 
validation_0-rmse:0.01852 [438] validation_0-rmse:0.01836 [439] validation_0-rmse:0.01826 [440] validation_0-rmse:0.01806 [441] validation_0-rmse:0.01784 [442] validation_0-rmse:0.01775 [443] validation_0-rmse:0.01768 [444] validation_0-rmse:0.01727 [445] validation_0-rmse:0.01703 [446] validation_0-rmse:0.01688 [447] validation_0-rmse:0.01682 [448] validation_0-rmse:0.01655 [449] validation_0-rmse:0.01630 [450] validation_0-rmse:0.01617 [451] validation_0-rmse:0.01595 [452] validation_0-rmse:0.01588 [453] validation_0-rmse:0.01585 [454] validation_0-rmse:0.01576 [455] validation_0-rmse:0.01568 [456] validation_0-rmse:0.01530 [457] validation_0-rmse:0.01513 [458] validation_0-rmse:0.01488 [459] validation_0-rmse:0.01475 [460] validation_0-rmse:0.01462 [461] validation_0-rmse:0.01439 [462] validation_0-rmse:0.01429 [463] validation_0-rmse:0.01394 [464] validation_0-rmse:0.01373 [465] validation_0-rmse:0.01366 [466] validation_0-rmse:0.01359 [467] validation_0-rmse:0.01352 [468] validation_0-rmse:0.01337 [469] validation_0-rmse:0.01322 [470] validation_0-rmse:0.01286 [471] validation_0-rmse:0.01252 [472] validation_0-rmse:0.01251 [473] validation_0-rmse:0.01237 [474] validation_0-rmse:0.01211 [475] validation_0-rmse:0.01178 [476] validation_0-rmse:0.01170 [477] validation_0-rmse:0.01162 [478] validation_0-rmse:0.01153 [479] validation_0-rmse:0.01137 [480] validation_0-rmse:0.01131 [481] validation_0-rmse:0.01119 [482] validation_0-rmse:0.01116 [483] validation_0-rmse:0.01113 [484] validation_0-rmse:0.01104 [485] validation_0-rmse:0.01103 [486] validation_0-rmse:0.01096 [487] validation_0-rmse:0.01092 [488] validation_0-rmse:0.01090 [489] validation_0-rmse:0.01068 [490] validation_0-rmse:0.01054 [491] validation_0-rmse:0.01020 [492] validation_0-rmse:0.00996 [493] validation_0-rmse:0.00988 [494] validation_0-rmse:0.00987 [495] validation_0-rmse:0.00985 [496] validation_0-rmse:0.00964 [497] validation_0-rmse:0.00951 [498] validation_0-rmse:0.00925 [499] 
validation_0-rmse:0.00905 [500] validation_0-rmse:0.00883 [501] validation_0-rmse:0.00858 [502] validation_0-rmse:0.00851 [503] validation_0-rmse:0.00839 [504] validation_0-rmse:0.00832 [505] validation_0-rmse:0.00823 [506] validation_0-rmse:0.00807 [507] validation_0-rmse:0.00802 [508] validation_0-rmse:0.00778 [509] validation_0-rmse:0.00775 [510] validation_0-rmse:0.00771 [511] validation_0-rmse:0.00768 [512] validation_0-rmse:0.00759 [513] validation_0-rmse:0.00756 [514] validation_0-rmse:0.00740 [515] validation_0-rmse:0.00738 [516] validation_0-rmse:0.00733 [517] validation_0-rmse:0.00722 [518] validation_0-rmse:0.00721 [519] validation_0-rmse:0.00720 [520] validation_0-rmse:0.00707 [521] validation_0-rmse:0.00695 [522] validation_0-rmse:0.00683 [523] validation_0-rmse:0.00674 [524] validation_0-rmse:0.00670 [525] validation_0-rmse:0.00669 [526] validation_0-rmse:0.00663 [527] validation_0-rmse:0.00662 [528] validation_0-rmse:0.00662 [529] validation_0-rmse:0.00659 [530] validation_0-rmse:0.00658 [531] validation_0-rmse:0.00655 [532] validation_0-rmse:0.00637 [533] validation_0-rmse:0.00635 [534] validation_0-rmse:0.00618 [535] validation_0-rmse:0.00616 [536] validation_0-rmse:0.00611 [537] validation_0-rmse:0.00598 [538] validation_0-rmse:0.00597 [539] validation_0-rmse:0.00589 [540] validation_0-rmse:0.00575 [541] validation_0-rmse:0.00568 [542] validation_0-rmse:0.00558 [543] validation_0-rmse:0.00557 [544] validation_0-rmse:0.00547 [545] validation_0-rmse:0.00544 [546] validation_0-rmse:0.00539 [547] validation_0-rmse:0.00533 [548] validation_0-rmse:0.00525 [549] validation_0-rmse:0.00511 [550] validation_0-rmse:0.00503 [551] validation_0-rmse:0.00485 [552] validation_0-rmse:0.00474 [553] validation_0-rmse:0.00464 [554] validation_0-rmse:0.00453 [555] validation_0-rmse:0.00444 [556] validation_0-rmse:0.00433 [557] validation_0-rmse:0.00433 [558] validation_0-rmse:0.00429 [559] validation_0-rmse:0.00420 [560] validation_0-rmse:0.00420 [561] 
validation_0-rmse:0.00414 [562] validation_0-rmse:0.00414 [563] validation_0-rmse:0.00414 [564] validation_0-rmse:0.00414 [565] validation_0-rmse:0.00414 [566] validation_0-rmse:0.00414 [567] validation_0-rmse:0.00414 [568] validation_0-rmse:0.00414 [569] validation_0-rmse:0.00414 [570] validation_0-rmse:0.00414
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=10, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=1000,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)
# Plot xgboost feature importances (height = bar thickness).
# NOTE(review): plot_importance is not imported in the visible header —
# presumably from xgboost.plotting; verify.
plot_importance(model, height = 1)
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
# Predict on the held-out window and store alongside the actuals
test['Previsões'] = model.predict(X_test)
# Recombine test + train (test first) for later inspection
XGBresults = pd.concat([test, train], sort = False)
pred = test['Previsões']
test_ = test['Vendas']
# Project helpers: compute the metric set and append a row to the comparison table
mse, rmse, mae, mape, mtv, mpv = calculate_metrics_off(test_, pred)
result = record(result, 'Original - XGBoost', mse, rmse, mae, mape, mtv, mpv, False)
# Plot: actual series vs. the XGBoost forecast over the test window.
real_trace = go.Scatter(x = seriesOriginal.index, y = seriesOriginal, name = 'Real')
pred_trace = go.Scatter(x = test.index, y = test['Previsões'], name = 'Previsto')
plot_data = [real_trace, pred_trace]
# Axis titles and chart title.
plot_layout = go.Layout(xaxis = dict(title = 'Período'),
                        yaxis = dict(title = 'Vendas'),
                        title = 'Original - XGBoost')
# Build the figure and shade the test-forecast window.
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.add_vrect(x0 = '2018-06-17', x1 = '2018-08-17',
              annotation_text = 'Previsão base<br>de teste', annotation_position = 'top left',
              annotation = dict(font_size = 23, font_family = 'Times New Roman'),
              fillcolor = 'orange', opacity = 0.2, line_width = 0)
pyoff.iplot(fig)
result
| Algorithm | MSE | RMSE | MAE | MAPE | Mean_True_Value | Mean_Predict_Value | |
|---|---|---|---|---|---|---|---|
| 0 | Original - Time Series Regression | 5296.329792 | 60.784585 | 42.960631 | 25.553663 | 179.560484 | 168.008596 |
| 1 | Deseasonal - Time Series Regression | 5081.77776 | 58.208746 | 40.122247 | 23.488563 | 179.560484 | 164.670085 |
| 2 | BoxCox - Time Series Regression | 5328.602465 | 61.060147 | 43.254815 | 25.416806 | 179.560484 | 165.399796 |
| 3 | Stationary - Time Series Regression | 6057.828087 | 64.323443 | 46.538896 | 27.899678 | 179.560484 | 184.267746 |
| 4 | Original - Exponential Smoothing | 4749.16575 | 55.864659 | 38.516078 | 22.095183 | 179.560484 | 165.919136 |
| 5 | Deseasonal - Exponential Smoothing | 4774.382728 | 55.5275 | 37.488666 | 21.8423 | 179.560484 | 166.749778 |
| 6 | BoxCox - Exponential Smoothing | 4653.246578 | 54.946508 | 37.555881 | 21.63599 | 179.560484 | 166.093056 |
| 7 | Stationary - Exponential Smoothing | 5658.627242 | 63.86568 | 43.658278 | 26.080376 | 179.560484 | 179.444886 |
| 8 | Original - Auto Arima | 11599.889603 | 89.225015 | 70.61781 | 42.311848 | 179.560484 | 194.668305 |
| 9 | Deseasonal - Auto Arima | 9591.302944 | 77.051189 | 59.640244 | 35.253536 | 179.560484 | 191.568473 |
| 10 | BoxCox - Auto Arima | 11393.604917 | 84.409223 | 66.06974 | 39.346984 | 179.560484 | 206.496282 |
| 11 | Stationary - Auto Arima | 5675.808118 | 62.612956 | 43.570659 | 25.453974 | 179.560484 | 169.27759 |
| 12 | Original - Long Short Term Memory (LSTM) | 9381.265495 | 84.147816 | 61.880612 | 38.23957 | 179.560484 | 188.430568 |
| 13 | Deseasonal - Long Short Term Memory (LSTM) | 5064.39062 | 57.410273 | 39.893818 | 23.371864 | 179.560484 | 164.653074 |
| 14 | BoxCox - Long Short Term Memory (LSTM) | 5325.842935 | 63.086576 | 44.862384 | 25.808698 | 179.560484 | 159.906651 |
| 15 | Stationary - Long Short Term Memory (LSTM) | 8713.79911 | 76.748656 | 57.541623 | 34.481319 | 179.560484 | 202.934807 |
| 16 | Original - XGBoost | 5021.661691 | 70.863684 | 61.2467 | 29.323271 | 221.854839 | 192.635712 |
# Clean up the algorithm names and add control columns to the results table.
# Fixes an unbalanced parenthesis in the original: the first .assign was
# closed with ')))' which ended the outer '(' early and made the following
# chained .assign calls a SyntaxError.
topResult = (
    result
    # Full_Name: the original label minus any trailing "(...)" suffix,
    # with long algorithm names shortened for display
    .assign(Full_Name = lambda x: x.Algorithm.apply(lambda a: a.split('(')[0]
                                                    .replace('ARIMA', 'Auto Arima')
                                                    .replace('Long Short Term Memory', 'LSTM')))
    # Data_Category: the series transformation (Original/Deseasonal/BoxCox/Stationary)
    .assign(Data_Category = lambda x: x.Algorithm.apply(lambda a: a.split(' - ')[0]))
    # Algorithm: just the model name (note: values derived from "... (LSTM)"
    # keep a trailing space after split('(')[0])
    .assign(Algorithm = lambda x: x.Algorithm.apply(lambda a: a.split(' - ')[1].split('(')[0]
                                                    .replace('ARIMA', 'Auto Arima')
                                                    .replace('Long Short Term Memory', 'LSTM')))
    # Best (lowest) MSE first
    .sort_values('MSE')
)
topResult.head()
| Algorithm | MSE | RMSE | MAE | MAPE | Mean_True_Value | Mean_Predict_Value | Full_Name | Data_Category | |
|---|---|---|---|---|---|---|---|---|---|
| 6 | Exponential Smoothing | 4653.246578 | 54.946508 | 37.555881 | 21.63599 | 179.560484 | 166.093056 | BoxCox - Exponential Smoothing | BoxCox |
| 4 | Exponential Smoothing | 4749.16575 | 55.864659 | 38.516078 | 22.095183 | 179.560484 | 165.919136 | Original - Exponential Smoothing | Original |
| 5 | Exponential Smoothing | 4774.382728 | 55.5275 | 37.488666 | 21.8423 | 179.560484 | 166.749778 | Deseasonal - Exponential Smoothing | Deseasonal |
| 16 | XGBoost | 5021.661691 | 70.863684 | 61.2467 | 29.323271 | 221.854839 | 192.635712 | Original - XGBoost | Original |
| 13 | LSTM | 5064.39062 | 57.410273 | 39.893818 | 23.371864 | 179.560484 | 164.653074 | Deseasonal - LSTM | Deseasonal |
# Plot the results ordered ascending by MSE.
# Color per algorithm. NOTE: the 'LSTM ' key deliberately carries a trailing
# space — topResult's Algorithm values derived from "... (LSTM)" retain a
# trailing space after split('(')[0], so the colors[t] lookup below matches.
colors = {'Time Series Regression':'#1688F2',
'Exponential Smoothing':'#17FC2E',
'Auto Arima': '#E6C220',
'LSTM ': '#FC4417',
'XGBoost': '#A13BF5'}
# plotly figure: one horizontal bar trace per algorithm
fig = go.Figure(layout = go.Layout(yaxis = {'type': 'category', 'title': 'Algoritmo e Categoria'},
xaxis = {'title': 'MSE'},
title = 'MSE por Algoritmo e Tipo de Dado'))
for t in topResult['Algorithm'].unique():
topResultFiltered = topResult[topResult['Algorithm']== t].copy()
fig.add_traces(go.Bar(x = topResultFiltered['MSE'], y = topResultFiltered['Full_Name'], name = str(t),\
marker_color = str(colors[t]), orientation = 'h',
text = round(topResultFiltered['MSE'].astype(np.double)), textposition = 'outside'))
# Worst (largest) bars at the top; fixed figure size for the notebook
fig.update_layout(yaxis = {'categoryorder':'total descending'}, autosize = False,
width = 1450,
height = 800)
fig.show()
# Daily date range covering the future-forecast horizon
pred_range = pd.date_range(datetime.date(2018, 6, 17), datetime.date(2018, 10, 31))
# Time-series CV splitter sized to that horizon (train and test windows equal)
split_range = TimeSeriesSplit(n_splits = 2, max_train_size = pred_range.shape[0], test_size = pred_range.shape[0])
BATCH_SIZE = 1000
# Load the best model's data (Box-Cox transformed series) and carve out the training window
data = seriesBoxCox.copy()
train = data[datetime.date(2017, 1, 1): datetime.date(2018, 6, 16)]
# Fit Holt-Winters with the parameters found during the modelling phase.
# NOTE(review): FEATURES is presumably the seasonal period (7 per the
# summary output further down) — confirm where it is defined.
ES = (
ExponentialSmoothing(train, trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
.fit(smoothing_level = 0.1, smoothing_trend = 0.7, smoothing_seasonal = 0.0, method = 'ls', damping_trend = 0.8)
)
# Forecast from the start of the series through the end of 2018
pred = ES.predict(str(data.index[0]), '2018-12-31')
# Project helper: inverts the Box-Cox transform on prediction and data
pred, data, _ = calcPredTestBoxCox(train, pred, data)
# Plot: full actual series vs. the Box-Cox Exponential Smoothing forecast.
plot_data = [
    go.Scatter(x = seriesOriginal.index, y = seriesOriginal, name = 'Real'),
    go.Scatter(x = pred.index, y = pred, name = 'Previsto'),
]
# Axis titles and chart title.
plot_layout = go.Layout(xaxis = dict(title = 'Período'),
                        yaxis = dict(title = 'Vendas'),
                        title = 'BoxCox - Exponential Smoothing')
fig = go.Figure(data = plot_data, layout = plot_layout)
# Shade the test-forecast window (orange) and the future projection (green).
for x0, x1, label, color in (
    ('2018-06-17', '2018-08-17', 'Previsão base<br>de teste', 'orange'),
    ('2018-08-17', '2018-12-31', 'Projeção de<br>Vendas Futuras', 'green'),
):
    fig.add_vrect(x0 = x0, x1 = x1,
                  annotation_text = label, annotation_position = 'top left',
                  annotation = dict(font_size = 23, font_family = 'Times New Roman'),
                  fillcolor = color, opacity = 0.2, line_width = 0)
fig.update_layout(yaxis_range = [0, 800])
pyoff.iplot(fig)
# Show the fitted model's parameter summary table.
ES.summary()
| Dep. Variable: | None | No. Observations: | 532 |
|---|---|---|---|
| Model: | ExponentialSmoothing | SSE | 7800.497 |
| Optimized: | True | AIC | 1452.579 |
| Trend: | Additive | BIC | 1503.899 |
| Seasonal: | Additive | AICC | 1453.392 |
| Seasonal Periods: | 7 | Date: | Sun, 09 Oct 2022 |
| Box-Cox: | False | Time: | 13:54:28 |
| Box-Cox Coeff.: | None |
| coeff | code | optimized | |
|---|---|---|---|
| smoothing_level | 0.1000000 | alpha | False |
| smoothing_trend | 0.7000000 | beta | False |
| smoothing_seasonal | 0.000000 | gamma | False |
| initial_level | -5.0438933 | l.0 | True |
| initial_trend | 1.3021823 | b.0 | True |
| damping_trend | 0.8000000 | phi | False |
| initial_seasons.0 | 0.5899851 | s.0 | True |
| initial_seasons.1 | 4.8396236 | s.1 | True |
| initial_seasons.2 | 4.7464382 | s.2 | True |
| initial_seasons.3 | 4.2431176 | s.3 | True |
| initial_seasons.4 | 3.6034685 | s.4 | True |
| initial_seasons.5 | 2.5651724 | s.5 | True |
| initial_seasons.6 | -0.7669847 | s.6 | True |
# Refit the same best model with identical parameters — this cell repeats
# the fit above but forecasts only through 2018-08-17 (end of the test window)
data = seriesBoxCox.copy()
train = data[datetime.date(2017, 1, 1): datetime.date(2018, 6, 16)]
# Holt-Winters with the parameters found during the modelling phase
ES = (
ExponentialSmoothing(train, trend = 'add', seasonal = 'add', seasonal_periods = FEATURES, damped_trend = True)
.fit(smoothing_level = 0.1, smoothing_trend = 0.7, smoothing_seasonal = 0.0, method = 'ls', damping_trend = 0.8)
)
# Forecast only up to the end of the test window
pred = ES.predict(str(data.index[0]), '2018-08-17')
pred, data, _ = calcPredTestBoxCox(train, pred, data)
# Plot
# Real vs. predicted; fill = "tonexty" shades the area between the two traces
plot_data = [go.Scatter(x = data.index,
y = data,
name = 'Real'),
go.Scatter(x = data.index,
y = pred,
name = 'Previsto', fill = "tonexty")]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Período'},
yaxis = {'title': 'Vendas'},
title = 'Deseasonal - Exponential Smoothing')
# Build and render the figure with a fixed y-range
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.update_layout(yaxis_range = [0 , 800])
pyoff.iplot(fig)
# Re-plot the same data/pred pair against an artificial date index
# (2017-01-01 through 2017-11-20) to inspect the shape of the curves.
tempIndex = pd.date_range(datetime.date(2017, 1, 1), datetime.date(2017, 11, 20))
# Trace definitions — tonexty shades the area between real and predicted.
real_trace = go.Scatter(x = tempIndex, y = data, name = 'Real')
pred_trace = go.Scatter(x = tempIndex, y = pred, name = 'Previsto', fill = "tonexty")
plot_data = [real_trace, pred_trace]
# Axis titles and chart title.
plot_layout = go.Layout(xaxis = dict(title = 'Período'),
                        yaxis = dict(title = 'Vendas'),
                        title = 'Deseasonal - Exponential Smoothing')
# Build and render the figure.
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
# Same comparison with a full-year 2017 index and a fixed y-range
tempIndex = pd.date_range(datetime.date(2017, 1, 1), datetime.date(2017, 12, 31))
# Plot
# Trace definitions (tonexty fills the area between real and predicted)
plot_data = [go.Scatter(x = tempIndex,
y = data,
name = 'Real'),
go.Scatter(x = tempIndex,
y = pred,
name = 'Previsto', fill = "tonexty")]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Período'},
yaxis = {'title': 'Vendas'},
title = 'Deseasonal - Exponential Smoothing')
# Build and render the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.update_layout(yaxis_range = [0 , 800])
pyoff.iplot(fig)
# 2018 window (Jan 1 through Aug 17), highlighting the test-forecast period
tempIndex = pd.date_range(datetime.date(2018, 1, 1), datetime.date(2018, 8, 17))
# Plot
# Trace definitions (tonexty fills the area between real and predicted)
plot_data = [go.Scatter(x = tempIndex,
y = data,
name = 'Real'),
go.Scatter(x = tempIndex,
y = pred,
name = 'Previsto', fill = "tonexty")]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Período'},
yaxis = {'title': 'Vendas'},
title = 'Deseasonal - Exponential Smoothing')
# Build the figure and shade the test-forecast window
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.add_vrect(x0 = '2018-06-17', x1 = '2018-08-17',
annotation_text = 'Previsão base<br>de teste', annotation_position = 'top left',
annotation = dict(font_size = 23, font_family = 'Times New Roman'),
fillcolor = 'orange', opacity = 0.2, line_width = 0)
pyoff.iplot(fig)
BATCH_SIZE = 30
# Build the daily-sales series used by the final LSTM run.
dtHistoryLSTM = pd.to_datetime(dtOrdersAdjusted['order_purchase_timestamp']).dt.date
# Observed period bounds
startLSTM = dtHistoryLSTM.min()
endLSTM = dtHistoryLSTM.max()
# Continuous daily index so gap days appear explicitly
idxLSTM = pd.date_range(startLSTM, endLSTM, normalize = True)
# Daily order counts, with missing days filled as 0
seriesOriginalLSTM = dtHistoryLSTM.value_counts(sort = False).sort_index().reindex(idxLSTM, fill_value = 0)
# Trim to the analysis window (drops the outlier periods at the edges)
seriesOriginalLSTM = seriesOriginalLSTM[datetime.date(2017, 1, 1): datetime.date(2018, 8, 17)]
# Placeholder zeros for the future horizon (2018-08-18 .. 2018-12-31 = 136 days)
newPredictions = pd.Series([0 for i in range(136)])
newPredictions.index = pd.date_range(datetime.date(2018, 8, 18), datetime.date(2018, 12, 31))
# Series.append was deprecated in pandas 1.4 and removed in 2.0 —
# pd.concat is the supported, behavior-identical replacement.
seriesOriginalTSTRPredictions = pd.concat([seriesOriginalLSTM, newPredictions])
# STL decomposition so the seasonal component can be removed
stl = STL(seriesOriginalTSTRPredictions)
res = stl.fit()
# Deseasonalized series = observed minus seasonal
seriesOriginalTSTRPredictionsDeseasonal = res.observed - res.seasonal
# Keep the datetime index for plotting later
dataTime = seriesOriginalTSTRPredictionsDeseasonal.copy().index
# Train/test split: first 532 days train, remainder is the test/future horizon
train_size = 532
test_size = len(seriesOriginalTSTRPredictionsDeseasonal) - train_size
train = seriesOriginalTSTRPredictionsDeseasonal[0:train_size]
test = seriesOriginalTSTRPredictionsDeseasonal[train_size:len(seriesOriginalTSTRPredictionsDeseasonal)]
# Scale training values into [0, 1]; the scaler is fit on train only (no leakage)
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train.values.reshape(-1, 1))
train_input = scaler.transform(train.values.reshape(-1, 1))
# Sliding-window generators; the test seed window is the last FEATURES+1 train points
test_input = train_input[-(FEATURES + 1):]
test_gen = TimeseriesGenerator(test_input, test_input, length = FEATURES, batch_size = BATCH_SIZE)
train_gen = TimeseriesGenerator(train_input, train_input, length = FEATURES, batch_size = BATCH_SIZE)
# Reset the Keras session so repeated notebook runs start fresh
clear_session()
# GlorotUniform initializer (normalized initialization), seeded for reproducibility
initializer = GlorotUniform(seed = SEED)
model = Sequential()
# 1 LSTM layer (64 units), 2 dense layers with a batch-norm layer between them
model.add(LSTM(64, input_shape = (FEATURES, 1), return_sequences = False))
model.add(Dense(1, kernel_initializer = initializer))
model.add(BatchNormalization())
model.add(Dense(1, kernel_initializer = initializer))
# Stop training after 3 consecutive epochs without improvement in training loss
early_stopping = EarlyStopping(monitor = 'loss', patience = 3, mode = 'min')
# Compile and fit with the Adam optimizer
model.compile(loss = 'mse', optimizer = Adam(learning_rate = 0.01), metrics = ['mae'])
model.fit(train_gen, epochs = 100, callbacks = [early_stopping], verbose = True)
# Recursive multi-step forecast over the test horizon: predict one step,
# slide the input window forward by one, repeat
pred = np.empty(test.shape[0])
for i in range(len(pred)):
prediction = model.predict(test_gen, verbose = False)
pred[i] = prediction
#print(test_gen[0][0], ' = ', prediction)
# Drop the oldest point and append the new prediction to the window
test_input = np.delete(test_input, 0, 0)
test_input = np.concatenate((test_input, np.array(prediction).reshape(-1, 1)), axis = 0)
test_gen = TimeseriesGenerator(test_input, test_input, length = FEATURES, batch_size = BATCH_SIZE)
# Undo the MinMax scaling, then apply the project's deseasonal inversion helper
pred = scaler.inverse_transform(pred.reshape(-1,1)).reshape(-1)
pred, test, _ = calcPredTestseriesDeseasonal(train, pred, test)
Epoch 1/100 18/18 [==============================] - 2s 4ms/step - loss: 0.0131 - mae: 0.0827 Epoch 2/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0096 - mae: 0.0789 Epoch 3/100 18/18 [==============================] - 0s 5ms/step - loss: 0.0071 - mae: 0.0633 Epoch 4/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0069 - mae: 0.0667 Epoch 5/100 18/18 [==============================] - 0s 5ms/step - loss: 0.0067 - mae: 0.0657 Epoch 6/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0067 - mae: 0.0655 Epoch 7/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0065 - mae: 0.0645 Epoch 8/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0072 - mae: 0.0656 Epoch 9/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0069 - mae: 0.0670 Epoch 10/100 18/18 [==============================] - 0s 4ms/step - loss: 0.0073 - mae: 0.0670
# Plot: actual deseasonalized series vs. recursive LSTM forecast
real_trace = go.Scatter(x = dataTime,
                        y = seriesOriginalTSTRPredictionsDeseasonal[:594],
                        name = 'Real')
forecast_trace = go.Scatter(x = pd.date_range(datetime.date(2018, 6, 17), datetime.date(2018, 12, 31)),
                            y = pred,
                            name = 'Previsto')
# Axis labels and chart title
plot_layout = go.Layout(xaxis = {'title': 'Período'},
                        yaxis = {'title': 'Vendas'},
                        title = 'Deseasonal - LSTM')
fig = go.Figure(data = [real_trace, forecast_trace], layout = plot_layout)
# Shade the hold-out test window (orange) and the future projection window (green)
annotation_style = dict(font_size = 23, font_family = 'Times New Roman')
fig.add_vrect(x0 = '2018-06-17', x1 = '2018-08-17',
              annotation_text = 'Previsão base<br>de teste', annotation_position = 'top left',
              annotation = annotation_style,
              fillcolor = 'orange', opacity = 0.2, line_width = 0)
fig.add_vrect(x0 = '2018-08-17', x1 = '2018-12-31',
              annotation_text = 'Projeção de<br>Vendas Futuras', annotation_position = 'top left',
              annotation = annotation_style,
              fillcolor = 'green', opacity = 0.2, line_width = 0)
fig.update_layout(yaxis_range = [0 , 800])
pyoff.iplot(fig)
# Large batch so gen[0] returns the entire training set in one slice
BATCH_SIZE = 1000
# Build the daily sales series from the order purchase timestamps
dtHistoryTSR = pd.to_datetime(dtOrdersAdjusted['order_purchase_timestamp']).dt.date
# Observed date range
startTSR = dtHistoryTSR.min()
endTSR = dtHistoryTSR.max()
# Continuous daily index to cover gaps in the data
idxTSR = pd.date_range(startTSR, endTSR, normalize = True)
# Daily order counts, missing days filled with 0
seriesOriginalTSR = dtHistoryTSR.value_counts(sort = False).sort_index().reindex(idxTSR, fill_value = 0)
# Trim outlier periods at the edges of the series
seriesOriginalTSR = seriesOriginalTSR[datetime.date(2017, 1, 1): datetime.date(2018, 8, 17)]
# Append a zero-filled placeholder window for the future forecast horizon
newPredictions = pd.Series([0 for i in range(136)])
newPredictions.index = pd.date_range(datetime.date(2018, 8, 18), datetime.date(2018, 12, 31))
# pd.concat instead of Series.append, which was removed in pandas 2.0
seriesOriginalTSRRPredictions = pd.concat([seriesOriginalTSR, newPredictions])
# STL decomposition to isolate the deseasonalized component
stl = STL(seriesOriginalTSRRPredictions)
res = stl.fit()
seriesOriginalTSRRPredictionsDeseasonal = res.observed - res.seasonal
# Train/test split (fixed cut at 532 daily observations)
train_size = 532
test_size = len(seriesOriginalTSRRPredictionsDeseasonal) - train_size
train = seriesOriginalTSRRPredictionsDeseasonal[0:train_size]
test = seriesOriginalTSRRPredictionsDeseasonal[train_size:len(seriesOriginalTSRRPredictionsDeseasonal)]
# Sliding-window samples: FEATURES lagged values -> next value
gen = TimeseriesGenerator(train, train, FEATURES, batch_size = BATCH_SIZE)
X_train = gen[0][0]
y_train = gen[0][1]
# Ordinary least-squares autoregressive model
lr = LinearRegression()
lr.fit(X_train, y_train)
# Seed the recursive forecast with the last FEATURES training targets
X_pred = y_train[-FEATURES:].reshape(1,-1)
pred = np.empty(test.shape[0])
# Recursive multi-step forecast: slide the input window forward with each prediction
for i in range(len(pred)):
    forecast = lr.predict(X_pred)
    X_pred = np.delete(X_pred, 0, 1)
    X_pred = np.concatenate((X_pred, forecast.reshape(-1, 1)), 1)
    # Store as a scalar — assigning a 1-element array into a float slot is
    # deprecated in NumPy and raises in newer versions
    pred[i] = float(forecast.reshape(-1)[0])
# Convert the deseasonalized forecast back to the original scale
pred, test, _ = calcPredTestseriesDeseasonal(train, pred, test)
# Plot: actual deseasonalized series vs. Time Series Regressor forecast
# FIX: the original plotted seriesOriginalTSTRPredictionsDeseasonal (the LSTM
# section's series, and its index via dataTime) here — a copy-paste slip; this
# section's series is the TSRR one.
plot_data = [go.Scatter(x = seriesOriginalTSRRPredictionsDeseasonal.index,
                        y = seriesOriginalTSRRPredictionsDeseasonal[:594],
                        name = 'Real'),
             go.Scatter(x = pd.date_range(datetime.date(2018, 6, 17), datetime.date(2018, 12, 31)),
                        y = pred,
                        name = 'Previsto')]
# Axis labels and chart title
plot_layout = go.Layout(xaxis = {'title': 'Período'},
                        yaxis = {'title': 'Vendas'},
                        title = 'Deseasonal - Time Series Regressor')
# Shade the hold-out test window (orange) and the future projection window (green)
fig = go.Figure(data = plot_data, layout = plot_layout)
fig.add_vrect(x0 = '2018-06-17', x1 = '2018-08-17',
              annotation_text = 'Previsão base<br>de teste', annotation_position = 'top left',
              annotation = dict(font_size = 23, font_family = 'Times New Roman'),
              fillcolor = 'orange', opacity = 0.2, line_width = 0)
fig.add_vrect(x0 = '2018-08-17', x1 = '2018-12-31',
              annotation_text = 'Projeção de<br>Vendas Futuras', annotation_position = 'top left',
              annotation = dict(font_size = 23, font_family = 'Times New Roman'),
              fillcolor = 'green', opacity = 0.2, line_width = 0)
fig.update_layout(yaxis_range = [0 , 800])
pyoff.iplot(fig)
# Build the daily sales series from the order purchase timestamps
dtHistoryArima = pd.to_datetime(dtOrdersAdjusted['order_purchase_timestamp']).dt.date
# Observed date range
startArima = dtHistoryArima.min()
endArima = dtHistoryArima.max()
# Continuous daily index to cover gaps in the data
idxArima = pd.date_range(startArima, endArima, normalize = True)
# Daily order counts, missing days filled with 0
seriesOriginalArima = dtHistoryArima.value_counts(sort = False).sort_index().reindex(idxArima, fill_value = 0)
# Trim outlier periods at the edges of the series
seriesOriginalArima = seriesOriginalArima[datetime.date(2017, 1, 1): datetime.date(2018, 8, 17)]
# Append a zero-filled placeholder window for the future forecast horizon
newPredictions = pd.Series([0 for i in range(136)])
newPredictions.index = pd.date_range(datetime.date(2018, 8, 18), datetime.date(2018, 12, 31))
# pd.concat instead of Series.append, which was removed in pandas 2.0
seriesOriginalArimaPredictions = pd.concat([seriesOriginalArima, newPredictions])
# STL decomposition (result kept in `res` for downstream cells)
stl = STL(seriesOriginalArimaPredictions)
res = stl.fit()
# Box-Cox transform; the +1 shift keeps zero counts in the valid domain
seriesBoxCoxArima, lmbda = boxcox(seriesOriginalArimaPredictions + 1)
seriesBoxCoxArima = pd.Series(seriesBoxCoxArima, index = seriesOriginalArimaPredictions.index)
# Train/test split (fixed cut at 532 daily observations)
train_size = 532
test_size = len(seriesBoxCoxArima) - train_size
train = seriesBoxCoxArima[0:train_size]
test = seriesBoxCoxArima[train_size:len(seriesBoxCoxArima)]
# NOTE(review): auto_arima searches orders on the FULL series (train + test),
# which leaks test data into model selection; the final fit below uses train only.
# Selected model on this data: ARIMA(1, 1, 1) x (1, 0, 2, 7).
arimaModel = auto_arima(seriesBoxCoxArima, m = FEATURES, seasonal = True)
arimaModel.fit(train)
pred = arimaModel.predict(test.shape[0])
# Invert the Box-Cox transform and compute the evaluation metrics via the helper
pred, test, _ = calcPredTestBoxCox(train, pred, test)
# Plot: actual daily sales vs. ARIMA forecast on the Box-Cox-transformed series
real_trace = go.Scatter(x = pd.date_range(datetime.date(2017, 1, 1), datetime.date(2018, 8, 17)),
                        y = seriesOriginalArima[:594],
                        name = 'Real')
forecast_trace = go.Scatter(x = pd.date_range(datetime.date(2018, 6, 17), datetime.date(2018, 12, 31)),
                            y = pred,
                            name = 'Previsto')
# Axis labels and chart title
plot_layout = go.Layout(xaxis = {'title': 'Período'},
                        yaxis = {'title': 'Vendas'},
                        title = 'BoxCox - Arima')
fig = go.Figure(data = [real_trace, forecast_trace], layout = plot_layout)
# Shade the hold-out test window (orange) and the future projection window (green)
annotation_style = dict(font_size = 23, font_family = 'Times New Roman')
fig.add_vrect(x0 = '2018-06-17', x1 = '2018-08-17',
              annotation_text = 'Previsão base<br>de teste', annotation_position = 'top left',
              annotation = annotation_style,
              fillcolor = 'orange', opacity = 0.2, line_width = 0)
fig.add_vrect(x0 = '2018-08-17', x1 = '2018-12-31',
              annotation_text = 'Projeção de<br>Vendas Futuras', annotation_position = 'top left',
              annotation = annotation_style,
              fillcolor = 'green', opacity = 0.2, line_width = 0)
fig.update_layout(yaxis_range = [0 , 800])
pyoff.iplot(fig)
# Re-index the history so the date is the index and sales the values
dtHistoryIndexado = dtHistory.copy()
dtHistoryIndexado.index = dtHistoryIndexado['Data']
dtHistoryIndexado = dtHistoryIndexado['Vendas']
# Date-based train/test split
split_date = '2018-06-16'
train = dtHistoryIndexado.loc[dtHistoryIndexado.index <= split_date].copy()
test = dtHistoryIndexado.loc[dtHistoryIndexado.index > split_date].copy()
# Back to DataFrames so calendar features can be attached as columns
train = pd.DataFrame(train)
test = pd.DataFrame(test)
# Derive calendar features (X) and the target (y) from the date index
X_train, y_train = create_features(train, label = 'Vendas')
X_test, y_test = create_features(test, label = 'Vendas')
# Gradient-boosted trees regressor; early stopping monitors the train set,
# as that is the only eval_set supplied
model = XGBRegressor(n_estimators = 1000, early_stopping_rounds = 10, objective = 'reg:squarederror')
model.fit(X_train, y_train,
          eval_set = [(X_train, y_train)],
          verbose = False)
# Append a zero-filled placeholder window for the future forecast horizon
newPredictions = pd.Series([0 for i in range(136)])
newPredictions.index = pd.date_range(datetime.date(2018, 8, 18), datetime.date(2018, 12, 31))
newPredictions = pd.DataFrame(newPredictions)
newPredictions = create_features(newPredictions, label = None)
# Feature matrix covering the hold-out window plus the future horizon
newPredictionsTest = pd.concat([X_test, newPredictions])
test = pd.concat([test, newPredictions])
# Predict the whole hold-out + future span in a single call
test['Previsões'] = model.predict(newPredictionsTest)
XGBresults = pd.concat([test, train], sort = False)
# Plot: actual daily sales vs. XGBoost forecast on the original series
real_trace = go.Scatter(x = seriesOriginal.index,
                        y = seriesOriginal,
                        name = 'Real')
forecast_trace = go.Scatter(x = test.index,
                            y = test['Previsões'],
                            name = 'Previsto')
# Axis labels and chart title
plot_layout = go.Layout(xaxis = {'title': 'Período'},
                        yaxis = {'title': 'Vendas'},
                        title = 'Original - XGBoost')
fig = go.Figure(data = [real_trace, forecast_trace], layout = plot_layout)
# Shade the hold-out test window (orange) and the future projection window (green)
annotation_style = dict(font_size = 23, font_family = 'Times New Roman')
fig.add_vrect(x0 = '2018-06-17', x1 = '2018-08-17',
              annotation_text = 'Previsão base<br>de teste', annotation_position = 'top left',
              annotation = annotation_style,
              fillcolor = 'orange', opacity = 0.2, line_width = 0)
fig.add_vrect(x0 = '2018-08-17', x1 = '2018-12-31',
              annotation_text = 'Projeção de<br>Vendas Futuras', annotation_position = 'top left',
              annotation = annotation_style,
              fillcolor = 'green', opacity = 0.2, line_width = 0)
fig.update_layout(yaxis_range = [0 , 800])
pyoff.iplot(fig)